|
a |
|
b/notebooks/resource-allocation/112023.ipynb |
|
|
1 |
{ |
|
|
2 |
"cells": [ |
|
|
3 |
{ |
|
|
4 |
"cell_type": "code", |
|
|
5 |
"execution_count": 1, |
|
|
6 |
"metadata": { |
|
|
7 |
"scrolled": false |
|
|
8 |
}, |
|
|
9 |
"outputs": [], |
|
|
10 |
"source": [ |
|
|
11 |
"import pandas as pd\n", |
|
|
12 |
"import seaborn as sns\n", |
|
|
13 |
"from datetime import datetime, timedelta, date\n", |
|
|
14 |
"from humanize import naturalsize\n", |
|
|
15 |
"import matplotlib.pyplot as plt\n", |
|
|
16 |
"import numpy as np\n", |
|
|
17 |
"\n", |
|
|
18 |
"%matplotlib inline" |
|
|
19 |
] |
|
|
20 |
}, |
|
|
21 |
{ |
|
|
22 |
"cell_type": "markdown", |
|
|
23 |
"metadata": {}, |
|
|
24 |
"source": [ |
|
|
25 |
"# Qiita's resource allocation\n", |
|
|
26 |
"\n", |
|
|
27 |
"This notebook walks through how to load & parse the job stats from Qiita. It additionally tries to split the different commands by their resource utilization and make sure to be as accurate/fair to request them. Here resource allocations are mainly walltime (`ElapsedRawTime`), memory (`MaxRSSRaw`) and the time a job took to start running (`WaitTime`: Start - Submit). " |
|
|
28 |
] |
|
|
29 |
}, |
|
|
30 |
{ |
|
|
31 |
"cell_type": "markdown", |
|
|
32 |
"metadata": {}, |
|
|
33 |
"source": [ |
|
|
34 |
"# Loading data\n", |
|
|
35 |
"\n", |
|
|
36 |
"First you will need to run `generate-allocation-summary.py` in Qiita as the qiita user (or whatever user runs qiita in your system). The resulting file would be: `job_[date].tsv.gz`.\n", |
|
|
37 |
"\n", |
|
|
38 |
"The generated file will have these columns: `['JobID', 'ElapsedRaw', 'MaxRSS', 'Submit', 'Start', 'MaxRSS.1', 'CPUTimeRAW', 'ReqMem', 'AllocCPUS', 'AveVMSize', 'QiitaID', 'external_id', 'sId', 'sName', 'sVersion', 'cId', 'cName', 'samples', 'columns', 'input_size', 'extra_info'],`." |
|
|
39 |
] |
|
|
40 |
}, |
|
|
41 |
{ |
|
|
42 |
"cell_type": "code", |
|
|
43 |
"execution_count": 2, |
|
|
44 |
"metadata": { |
|
|
45 |
"scrolled": true |
|
|
46 |
}, |
|
|
47 |
"outputs": [], |
|
|
48 |
"source": [ |
|
|
49 |
"m1g = 2**30\n", |
|
|
50 |
"df = pd.read_csv('jobs_2023-10-31.tsv.gz', sep='\\t', dtype={'extra_info': str})\n", |
|
|
51 |
"df['ElapsedRawTime'] = pd.to_timedelta(df.ElapsedRawTime)" |
|
|
52 |
] |
|
|
53 |
}, |
|
|
54 |
{ |
|
|
55 |
"cell_type": "code", |
|
|
56 |
"execution_count": 3, |
|
|
57 |
"metadata": {}, |
|
|
58 |
"outputs": [ |
|
|
59 |
{ |
|
|
60 |
"data": { |
|
|
61 |
"text/plain": [ |
|
|
62 |
"'There are 106548 successful jobs since we moved to barnacle2 and the largest external_id is: 1614116'" |
|
|
63 |
] |
|
|
64 |
}, |
|
|
65 |
"execution_count": 3, |
|
|
66 |
"metadata": {}, |
|
|
67 |
"output_type": "execute_result" |
|
|
68 |
} |
|
|
69 |
], |
|
|
70 |
"source": [ |
|
|
71 |
"# for reference for the next iteration of this notebook\n", |
|
|
72 |
"f'There are {df.shape[0]} successful jobs since we moved to barnacle2 and the largest external_id is: {df.external_id.max()}'" |
|
|
73 |
] |
|
|
74 |
}, |
|
|
75 |
{ |
|
|
76 |
"cell_type": "markdown", |
|
|
77 |
"metadata": {}, |
|
|
78 |
"source": [ |
|
|
79 |
"# Deciding what to optimize and what to leave with a default value\n", |
|
|
80 |
"\n", |
|
|
81 |
"In the previous versions (072023, 102023, 102023.1) we decided to only optimize things that are using more than 4gb or 4hrs and now we want to review commands that are below 4g and 4hrs so we add specific parameters for them." |
|
|
82 |
] |
|
|
83 |
}, |
|
|
84 |
{ |
|
|
85 |
"cell_type": "code", |
|
|
86 |
"execution_count": 4, |
|
|
87 |
"metadata": {}, |
|
|
88 |
"outputs": [ |
|
|
89 |
{ |
|
|
90 |
"name": "stdout", |
|
|
91 |
"output_type": "stream", |
|
|
92 |
"text": [ |
|
|
93 |
"qiita: 10\n" |
|
|
94 |
] |
|
|
95 |
}, |
|
|
96 |
{ |
|
|
97 |
"data": { |
|
|
98 |
"text/html": [ |
|
|
99 |
"<div>\n", |
|
|
100 |
"<style scoped>\n", |
|
|
101 |
" .dataframe tbody tr th:only-of-type {\n", |
|
|
102 |
" vertical-align: middle;\n", |
|
|
103 |
" }\n", |
|
|
104 |
"\n", |
|
|
105 |
" .dataframe tbody tr th {\n", |
|
|
106 |
" vertical-align: top;\n", |
|
|
107 |
" }\n", |
|
|
108 |
"\n", |
|
|
109 |
" .dataframe thead tr th {\n", |
|
|
110 |
" text-align: left;\n", |
|
|
111 |
" }\n", |
|
|
112 |
"\n", |
|
|
113 |
" .dataframe thead tr:last-of-type th {\n", |
|
|
114 |
" text-align: right;\n", |
|
|
115 |
" }\n", |
|
|
116 |
"</style>\n", |
|
|
117 |
"<table border=\"1\" class=\"dataframe\">\n", |
|
|
118 |
" <thead>\n", |
|
|
119 |
" <tr>\n", |
|
|
120 |
" <th></th>\n", |
|
|
121 |
" <th></th>\n", |
|
|
122 |
" <th colspan=\"3\" halign=\"left\">ElapsedRawTime</th>\n", |
|
|
123 |
" <th colspan=\"2\" halign=\"left\">MaxRSSRaw</th>\n", |
|
|
124 |
" </tr>\n", |
|
|
125 |
" <tr>\n", |
|
|
126 |
" <th></th>\n", |
|
|
127 |
" <th></th>\n", |
|
|
128 |
" <th>count</th>\n", |
|
|
129 |
" <th>min</th>\n", |
|
|
130 |
" <th>max</th>\n", |
|
|
131 |
" <th>min</th>\n", |
|
|
132 |
" <th>max</th>\n", |
|
|
133 |
" </tr>\n", |
|
|
134 |
" <tr>\n", |
|
|
135 |
" <th>cName</th>\n", |
|
|
136 |
" <th>sName</th>\n", |
|
|
137 |
" <th></th>\n", |
|
|
138 |
" <th></th>\n", |
|
|
139 |
" <th></th>\n", |
|
|
140 |
" <th></th>\n", |
|
|
141 |
" <th></th>\n", |
|
|
142 |
" </tr>\n", |
|
|
143 |
" </thead>\n", |
|
|
144 |
" <tbody>\n", |
|
|
145 |
" <tr>\n", |
|
|
146 |
" <th>delete_artifact</th>\n", |
|
|
147 |
" <th>Qiita</th>\n", |
|
|
148 |
" <td>1534</td>\n", |
|
|
149 |
" <td>0 days 00:00:03</td>\n", |
|
|
150 |
" <td>0 days 02:48:08</td>\n", |
|
|
151 |
" <td>0 Bytes</td>\n", |
|
|
152 |
" <td>122.2 MB</td>\n", |
|
|
153 |
" </tr>\n", |
|
|
154 |
" <tr>\n", |
|
|
155 |
" <th>create_sample_template</th>\n", |
|
|
156 |
" <th>Qiita</th>\n", |
|
|
157 |
" <td>569</td>\n", |
|
|
158 |
" <td>0 days 00:00:03</td>\n", |
|
|
159 |
" <td>0 days 00:11:22</td>\n", |
|
|
160 |
" <td>0 Bytes</td>\n", |
|
|
161 |
" <td>415.8 MB</td>\n", |
|
|
162 |
" </tr>\n", |
|
|
163 |
" <tr>\n", |
|
|
164 |
" <th>delete_analysis</th>\n", |
|
|
165 |
" <th>Qiita</th>\n", |
|
|
166 |
" <td>320</td>\n", |
|
|
167 |
" <td>0 days 00:00:03</td>\n", |
|
|
168 |
" <td>0 days 00:06:13</td>\n", |
|
|
169 |
" <td>0 Bytes</td>\n", |
|
|
170 |
" <td>120.8 MB</td>\n", |
|
|
171 |
" </tr>\n", |
|
|
172 |
" <tr>\n", |
|
|
173 |
" <th>download_remote_files</th>\n", |
|
|
174 |
" <th>Qiita</th>\n", |
|
|
175 |
" <td>194</td>\n", |
|
|
176 |
" <td>0 days 00:00:07</td>\n", |
|
|
177 |
" <td>0 days 03:29:36</td>\n", |
|
|
178 |
" <td>0 Bytes</td>\n", |
|
|
179 |
" <td>128.9 MB</td>\n", |
|
|
180 |
" </tr>\n", |
|
|
181 |
" <tr>\n", |
|
|
182 |
" <th>delete_sample_template</th>\n", |
|
|
183 |
" <th>Qiita</th>\n", |
|
|
184 |
" <td>181</td>\n", |
|
|
185 |
" <td>0 days 00:00:04</td>\n", |
|
|
186 |
" <td>0 days 00:19:31</td>\n", |
|
|
187 |
" <td>0 Bytes</td>\n", |
|
|
188 |
" <td>120.6 MB</td>\n", |
|
|
189 |
" </tr>\n", |
|
|
190 |
" <tr>\n", |
|
|
191 |
" <th>delete_study</th>\n", |
|
|
192 |
" <th>Qiita</th>\n", |
|
|
193 |
" <td>136</td>\n", |
|
|
194 |
" <td>0 days 00:00:03</td>\n", |
|
|
195 |
" <td>0 days 00:16:09</td>\n", |
|
|
196 |
" <td>0 Bytes</td>\n", |
|
|
197 |
" <td>125.5 MB</td>\n", |
|
|
198 |
" </tr>\n", |
|
|
199 |
" <tr>\n", |
|
|
200 |
" <th>update_prep_template</th>\n", |
|
|
201 |
" <th>Qiita</th>\n", |
|
|
202 |
" <td>126</td>\n", |
|
|
203 |
" <td>0 days 00:00:03</td>\n", |
|
|
204 |
" <td>0 days 00:02:25</td>\n", |
|
|
205 |
" <td>0 Bytes</td>\n", |
|
|
206 |
" <td>125.3 MB</td>\n", |
|
|
207 |
" </tr>\n", |
|
|
208 |
" <tr>\n", |
|
|
209 |
" <th>copy_artifact</th>\n", |
|
|
210 |
" <th>Qiita</th>\n", |
|
|
211 |
" <td>101</td>\n", |
|
|
212 |
" <td>0 days 00:00:06</td>\n", |
|
|
213 |
" <td>0 days 00:33:16</td>\n", |
|
|
214 |
" <td>0 Bytes</td>\n", |
|
|
215 |
" <td>124.1 MB</td>\n", |
|
|
216 |
" </tr>\n", |
|
|
217 |
" <tr>\n", |
|
|
218 |
" <th>Generate HTML summary</th>\n", |
|
|
219 |
" <th>Sequencing Data Type</th>\n", |
|
|
220 |
" <td>78</td>\n", |
|
|
221 |
" <td>0 days 00:00:35</td>\n", |
|
|
222 |
" <td>0 days 02:18:54</td>\n", |
|
|
223 |
" <td>56.6 MB</td>\n", |
|
|
224 |
" <td>85.7 MB</td>\n", |
|
|
225 |
" </tr>\n", |
|
|
226 |
" <tr>\n", |
|
|
227 |
" <th>list_remote_files</th>\n", |
|
|
228 |
" <th>Qiita</th>\n", |
|
|
229 |
" <td>47</td>\n", |
|
|
230 |
" <td>0 days 00:00:05</td>\n", |
|
|
231 |
" <td>0 days 00:02:21</td>\n", |
|
|
232 |
" <td>0 Bytes</td>\n", |
|
|
233 |
" <td>121.7 MB</td>\n", |
|
|
234 |
" </tr>\n", |
|
|
235 |
" </tbody>\n", |
|
|
236 |
"</table>\n", |
|
|
237 |
"</div>" |
|
|
238 |
], |
|
|
239 |
"text/plain": [ |
|
|
240 |
" ElapsedRawTime \\\n", |
|
|
241 |
" count min \n", |
|
|
242 |
"cName sName \n", |
|
|
243 |
"delete_artifact Qiita 1534 0 days 00:00:03 \n", |
|
|
244 |
"create_sample_template Qiita 569 0 days 00:00:03 \n", |
|
|
245 |
"delete_analysis Qiita 320 0 days 00:00:03 \n", |
|
|
246 |
"download_remote_files Qiita 194 0 days 00:00:07 \n", |
|
|
247 |
"delete_sample_template Qiita 181 0 days 00:00:04 \n", |
|
|
248 |
"delete_study Qiita 136 0 days 00:00:03 \n", |
|
|
249 |
"update_prep_template Qiita 126 0 days 00:00:03 \n", |
|
|
250 |
"copy_artifact Qiita 101 0 days 00:00:06 \n", |
|
|
251 |
"Generate HTML summary Sequencing Data Type 78 0 days 00:00:35 \n", |
|
|
252 |
"list_remote_files Qiita 47 0 days 00:00:05 \n", |
|
|
253 |
"\n", |
|
|
254 |
" MaxRSSRaw \\\n", |
|
|
255 |
" max min \n", |
|
|
256 |
"cName sName \n", |
|
|
257 |
"delete_artifact Qiita 0 days 02:48:08 0 Bytes \n", |
|
|
258 |
"create_sample_template Qiita 0 days 00:11:22 0 Bytes \n", |
|
|
259 |
"delete_analysis Qiita 0 days 00:06:13 0 Bytes \n", |
|
|
260 |
"download_remote_files Qiita 0 days 03:29:36 0 Bytes \n", |
|
|
261 |
"delete_sample_template Qiita 0 days 00:19:31 0 Bytes \n", |
|
|
262 |
"delete_study Qiita 0 days 00:16:09 0 Bytes \n", |
|
|
263 |
"update_prep_template Qiita 0 days 00:02:25 0 Bytes \n", |
|
|
264 |
"copy_artifact Qiita 0 days 00:33:16 0 Bytes \n", |
|
|
265 |
"Generate HTML summary Sequencing Data Type 0 days 02:18:54 56.6 MB \n", |
|
|
266 |
"list_remote_files Qiita 0 days 00:02:21 0 Bytes \n", |
|
|
267 |
"\n", |
|
|
268 |
" \n", |
|
|
269 |
" max \n", |
|
|
270 |
"cName sName \n", |
|
|
271 |
"delete_artifact Qiita 122.2 MB \n", |
|
|
272 |
"create_sample_template Qiita 415.8 MB \n", |
|
|
273 |
"delete_analysis Qiita 120.8 MB \n", |
|
|
274 |
"download_remote_files Qiita 128.9 MB \n", |
|
|
275 |
"delete_sample_template Qiita 120.6 MB \n", |
|
|
276 |
"delete_study Qiita 125.5 MB \n", |
|
|
277 |
"update_prep_template Qiita 125.3 MB \n", |
|
|
278 |
"copy_artifact Qiita 124.1 MB \n", |
|
|
279 |
"Generate HTML summary Sequencing Data Type 85.7 MB \n", |
|
|
280 |
"list_remote_files Qiita 121.7 MB " |
|
|
281 |
] |
|
|
282 |
}, |
|
|
283 |
"metadata": {}, |
|
|
284 |
"output_type": "display_data" |
|
|
285 |
}, |
|
|
286 |
{ |
|
|
287 |
"name": "stdout", |
|
|
288 |
"output_type": "stream", |
|
|
289 |
"text": [ |
|
|
290 |
"qiime2: 8\n" |
|
|
291 |
] |
|
|
292 |
}, |
|
|
293 |
{ |
|
|
294 |
"data": { |
|
|
295 |
"text/html": [ |
|
|
296 |
"<div>\n", |
|
|
297 |
"<style scoped>\n", |
|
|
298 |
" .dataframe tbody tr th:only-of-type {\n", |
|
|
299 |
" vertical-align: middle;\n", |
|
|
300 |
" }\n", |
|
|
301 |
"\n", |
|
|
302 |
" .dataframe tbody tr th {\n", |
|
|
303 |
" vertical-align: top;\n", |
|
|
304 |
" }\n", |
|
|
305 |
"\n", |
|
|
306 |
" .dataframe thead tr th {\n", |
|
|
307 |
" text-align: left;\n", |
|
|
308 |
" }\n", |
|
|
309 |
"\n", |
|
|
310 |
" .dataframe thead tr:last-of-type th {\n", |
|
|
311 |
" text-align: right;\n", |
|
|
312 |
" }\n", |
|
|
313 |
"</style>\n", |
|
|
314 |
"<table border=\"1\" class=\"dataframe\">\n", |
|
|
315 |
" <thead>\n", |
|
|
316 |
" <tr>\n", |
|
|
317 |
" <th></th>\n", |
|
|
318 |
" <th></th>\n", |
|
|
319 |
" <th colspan=\"3\" halign=\"left\">ElapsedRawTime</th>\n", |
|
|
320 |
" <th colspan=\"2\" halign=\"left\">MaxRSSRaw</th>\n", |
|
|
321 |
" </tr>\n", |
|
|
322 |
" <tr>\n", |
|
|
323 |
" <th></th>\n", |
|
|
324 |
" <th></th>\n", |
|
|
325 |
" <th>count</th>\n", |
|
|
326 |
" <th>min</th>\n", |
|
|
327 |
" <th>max</th>\n", |
|
|
328 |
" <th>min</th>\n", |
|
|
329 |
" <th>max</th>\n", |
|
|
330 |
" </tr>\n", |
|
|
331 |
" <tr>\n", |
|
|
332 |
" <th>cName</th>\n", |
|
|
333 |
" <th>sName</th>\n", |
|
|
334 |
" <th></th>\n", |
|
|
335 |
" <th></th>\n", |
|
|
336 |
" <th></th>\n", |
|
|
337 |
" <th></th>\n", |
|
|
338 |
" <th></th>\n", |
|
|
339 |
" </tr>\n", |
|
|
340 |
" </thead>\n", |
|
|
341 |
" <tbody>\n", |
|
|
342 |
" <tr>\n", |
|
|
343 |
" <th>adonis PERMANOVA test for beta group significance [adonis]</th>\n", |
|
|
344 |
" <th>qiime2</th>\n", |
|
|
345 |
" <td>552</td>\n", |
|
|
346 |
" <td>0 days 00:00:57</td>\n", |
|
|
347 |
" <td>0 days 00:39:12</td>\n", |
|
|
348 |
" <td>147.4 MB</td>\n", |
|
|
349 |
" <td>3.5 GB</td>\n", |
|
|
350 |
" </tr>\n", |
|
|
351 |
" <tr>\n", |
|
|
352 |
" <th>Core diversity metrics (non-phylogenetic) [core_metrics]</th>\n", |
|
|
353 |
" <th>qiime2</th>\n", |
|
|
354 |
" <td>100</td>\n", |
|
|
355 |
" <td>0 days 00:02:17</td>\n", |
|
|
356 |
" <td>0 days 00:25:31</td>\n", |
|
|
357 |
" <td>213.5 MB</td>\n", |
|
|
358 |
" <td>4.3 GB</td>\n", |
|
|
359 |
" </tr>\n", |
|
|
360 |
" <tr>\n", |
|
|
361 |
" <th>Taxonomy-based feature table filter. [filter_table]</th>\n", |
|
|
362 |
" <th>qiime2</th>\n", |
|
|
363 |
" <td>74</td>\n", |
|
|
364 |
" <td>0 days 00:00:52</td>\n", |
|
|
365 |
" <td>0 days 00:19:37</td>\n", |
|
|
366 |
" <td>214.9 MB</td>\n", |
|
|
367 |
" <td>2.6 GB</td>\n", |
|
|
368 |
" </tr>\n", |
|
|
369 |
" <tr>\n", |
|
|
370 |
" <th>Summarize table [summarize]</th>\n", |
|
|
371 |
" <th>qiime2</th>\n", |
|
|
372 |
" <td>64</td>\n", |
|
|
373 |
" <td>0 days 00:00:56</td>\n", |
|
|
374 |
" <td>0 days 00:05:54</td>\n", |
|
|
375 |
" <td>229.8 MB</td>\n", |
|
|
376 |
" <td>3.0 GB</td>\n", |
|
|
377 |
" </tr>\n", |
|
|
378 |
" <tr>\n", |
|
|
379 |
" <th>Add pseudocount to table. [add_pseudocount]</th>\n", |
|
|
380 |
" <th>qiime2</th>\n", |
|
|
381 |
" <td>55</td>\n", |
|
|
382 |
" <td>0 days 00:01:04</td>\n", |
|
|
383 |
" <td>0 days 00:06:14</td>\n", |
|
|
384 |
" <td>242.5 MB</td>\n", |
|
|
385 |
" <td>2.9 GB</td>\n", |
|
|
386 |
" </tr>\n", |
|
|
387 |
" <tr>\n", |
|
|
388 |
" <th>Filter features from a table based on abundance and prevalence [filter_features_conditionally]</th>\n", |
|
|
389 |
" <th>qiime2</th>\n", |
|
|
390 |
" <td>53</td>\n", |
|
|
391 |
" <td>0 days 00:00:53</td>\n", |
|
|
392 |
" <td>0 days 00:02:33</td>\n", |
|
|
393 |
" <td>212.4 MB</td>\n", |
|
|
394 |
" <td>553.3 MB</td>\n", |
|
|
395 |
" </tr>\n", |
|
|
396 |
" <tr>\n", |
|
|
397 |
" <th>Identify core features in table [core_features]</th>\n", |
|
|
398 |
" <th>qiime2</th>\n", |
|
|
399 |
" <td>49</td>\n", |
|
|
400 |
" <td>0 days 00:01:03</td>\n", |
|
|
401 |
" <td>0 days 00:59:29</td>\n", |
|
|
402 |
" <td>212.9 MB</td>\n", |
|
|
403 |
" <td>2.6 GB</td>\n", |
|
|
404 |
" </tr>\n", |
|
|
405 |
" <tr>\n", |
|
|
406 |
" <th>Filter features from table [filter_features]</th>\n", |
|
|
407 |
" <th>qiime2</th>\n", |
|
|
408 |
" <td>48</td>\n", |
|
|
409 |
" <td>0 days 00:00:47</td>\n", |
|
|
410 |
" <td>0 days 00:03:34</td>\n", |
|
|
411 |
" <td>208.3 MB</td>\n", |
|
|
412 |
" <td>398.4 MB</td>\n", |
|
|
413 |
" </tr>\n", |
|
|
414 |
" </tbody>\n", |
|
|
415 |
"</table>\n", |
|
|
416 |
"</div>" |
|
|
417 |
], |
|
|
418 |
"text/plain": [ |
|
|
419 |
" ElapsedRawTime \\\n", |
|
|
420 |
" count \n", |
|
|
421 |
"cName sName \n", |
|
|
422 |
"adonis PERMANOVA test for beta group significan... qiime2 552 \n", |
|
|
423 |
"Core diversity metrics (non-phylogenetic) [core... qiime2 100 \n", |
|
|
424 |
"Taxonomy-based feature table filter. [filter_ta... qiime2 74 \n", |
|
|
425 |
"Summarize table [summarize] qiime2 64 \n", |
|
|
426 |
"Add pseudocount to table. [add_pseudocount] qiime2 55 \n", |
|
|
427 |
"Filter features from a table based on abundance... qiime2 53 \n", |
|
|
428 |
"Identify core features in table [core_features] qiime2 49 \n", |
|
|
429 |
"Filter features from table [filter_features] qiime2 48 \n", |
|
|
430 |
"\n", |
|
|
431 |
" \\\n", |
|
|
432 |
" min \n", |
|
|
433 |
"cName sName \n", |
|
|
434 |
"adonis PERMANOVA test for beta group significan... qiime2 0 days 00:00:57 \n", |
|
|
435 |
"Core diversity metrics (non-phylogenetic) [core... qiime2 0 days 00:02:17 \n", |
|
|
436 |
"Taxonomy-based feature table filter. [filter_ta... qiime2 0 days 00:00:52 \n", |
|
|
437 |
"Summarize table [summarize] qiime2 0 days 00:00:56 \n", |
|
|
438 |
"Add pseudocount to table. [add_pseudocount] qiime2 0 days 00:01:04 \n", |
|
|
439 |
"Filter features from a table based on abundance... qiime2 0 days 00:00:53 \n", |
|
|
440 |
"Identify core features in table [core_features] qiime2 0 days 00:01:03 \n", |
|
|
441 |
"Filter features from table [filter_features] qiime2 0 days 00:00:47 \n", |
|
|
442 |
"\n", |
|
|
443 |
" \\\n", |
|
|
444 |
" max \n", |
|
|
445 |
"cName sName \n", |
|
|
446 |
"adonis PERMANOVA test for beta group significan... qiime2 0 days 00:39:12 \n", |
|
|
447 |
"Core diversity metrics (non-phylogenetic) [core... qiime2 0 days 00:25:31 \n", |
|
|
448 |
"Taxonomy-based feature table filter. [filter_ta... qiime2 0 days 00:19:37 \n", |
|
|
449 |
"Summarize table [summarize] qiime2 0 days 00:05:54 \n", |
|
|
450 |
"Add pseudocount to table. [add_pseudocount] qiime2 0 days 00:06:14 \n", |
|
|
451 |
"Filter features from a table based on abundance... qiime2 0 days 00:02:33 \n", |
|
|
452 |
"Identify core features in table [core_features] qiime2 0 days 00:59:29 \n", |
|
|
453 |
"Filter features from table [filter_features] qiime2 0 days 00:03:34 \n", |
|
|
454 |
"\n", |
|
|
455 |
" MaxRSSRaw \n", |
|
|
456 |
" min max \n", |
|
|
457 |
"cName sName \n", |
|
|
458 |
"adonis PERMANOVA test for beta group significan... qiime2 147.4 MB 3.5 GB \n", |
|
|
459 |
"Core diversity metrics (non-phylogenetic) [core... qiime2 213.5 MB 4.3 GB \n", |
|
|
460 |
"Taxonomy-based feature table filter. [filter_ta... qiime2 214.9 MB 2.6 GB \n", |
|
|
461 |
"Summarize table [summarize] qiime2 229.8 MB 3.0 GB \n", |
|
|
462 |
"Add pseudocount to table. [add_pseudocount] qiime2 242.5 MB 2.9 GB \n", |
|
|
463 |
"Filter features from a table based on abundance... qiime2 212.4 MB 553.3 MB \n", |
|
|
464 |
"Identify core features in table [core_features] qiime2 212.9 MB 2.6 GB \n", |
|
|
465 |
"Filter features from table [filter_features] qiime2 208.3 MB 398.4 MB " |
|
|
466 |
] |
|
|
467 |
}, |
|
|
468 |
"metadata": {}, |
|
|
469 |
"output_type": "display_data" |
|
|
470 |
} |
|
|
471 |
], |
|
|
472 |
"source": [ |
|
|
473 |
"summary = df.groupby(['cName', 'sName'])[\n", |
|
|
474 |
" ['ElapsedRawTime', 'MaxRSSRaw']].agg(['count', 'min', 'max']).copy()\n", |
|
|
475 |
"\n", |
|
|
476 |
"# We are gonna focus on jobs that request more than 4gb or take more than 4 hrs.\n", |
|
|
477 |
"summary = summary[(summary[('MaxRSSRaw', 'max')] < 4*m1g) & \n", |
|
|
478 |
" (summary[('ElapsedRawTime', 'max')] < timedelta(hours=4))]\n", |
|
|
479 |
"\n", |
|
|
480 |
"summary.sort_values(('MaxRSSRaw', 'count'), inplace=True, ascending=False)\n", |
|
|
481 |
"summary.drop(columns=[('MaxRSSRaw', 'count')], inplace=True)\n", |
|
|
482 |
"\n", |
|
|
483 |
"# ignore commands with less than 40 jobs to avoid over fitting early\n", |
|
|
484 |
"summary = summary[summary[('ElapsedRawTime', 'count')] > 40]\n", |
|
|
485 |
"\n", |
|
|
486 |
"# ignore commands that were optimized on the previous notebooks - as part of larger sets\n", |
|
|
487 |
"# summary = summary[]\n", |
|
|
488 |
"summary = summary[summary.index.get_level_values('cName') != 'Validate']\n", |
|
|
489 |
"\n", |
|
|
490 |
"summary[('MaxRSSRaw', 'min')] = summary[('MaxRSSRaw', 'min')].apply(naturalsize)\n", |
|
|
491 |
"summary[('MaxRSSRaw', 'max')] = summary[('MaxRSSRaw', 'max')].apply(naturalsize)\n", |
|
|
492 |
"\n", |
|
|
493 |
"_df = summary[summary.index.get_level_values('sName') != 'qiime2']\n", |
|
|
494 |
"print (\"qiita:\", _df.shape[0])\n", |
|
|
495 |
"display(_df)\n", |
|
|
496 |
"\n", |
|
|
497 |
"_df = summary[summary.index.get_level_values('sName') == 'qiime2']\n", |
|
|
498 |
"print (\"qiime2:\", _df.shape[0])\n", |
|
|
499 |
"display(_df)\n", |
|
|
500 |
"\n", |
|
|
501 |
"# *** RESOURCE ALLOCATION ***\n", |
|
|
502 |
"\n", |
|
|
503 |
"# Qiita jobs \n", |
|
|
504 |
"# INSERT INTO qiita.processing_job_resource_allocation (name, job_type, allocation) \n", |
|
|
505 |
"# VALUES \n", |
|
|
506 |
"# ('delete_artifact', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 3:00:00'),\n", |
|
|
507 |
"# ('create_sample_template', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 600mb --time 00:20:00'),\n", |
|
|
508 |
"# ('delete_analysis', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 00:10:00'),\n", |
|
|
509 |
"# ('download_remote_files', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 4:00:00'),\n", |
|
|
510 |
"# ('delete_sample_template', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 00:30:00'),\n", |
|
|
511 |
"# ('delete_study', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 00:30:00'),\n", |
|
|
512 |
"# ('update_prep_template', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 00:05:00'),\n", |
|
|
513 |
"# ('copy_artifact', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 1:00:00'),\n", |
|
|
514 |
"# ('list_remote_files', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 00:05:00');\n", |
|
|
515 |
"\n", |
|
|
516 |
"# Q2 jobs\n", |
|
|
517 |
"# INSERT INTO qiita.processing_job_resource_allocation (name, job_type, allocation) \n", |
|
|
518 |
"# VALUES \n", |
|
|
519 |
"# ('adonis PERMANOVA test for beta group significance [adonis]', 'RESOURCE_PARAMS_COMMAND', \n", |
|
|
520 |
"# '-p qiita -N 1 -n 1 --mem 4gb --time 4:00:00'),\n", |
|
|
521 |
"# ('Core diversity metrics (non-phylogenetic) [core_metrics]', 'RESOURCE_PARAMS_COMMAND', \n", |
|
|
522 |
"# '-p qiita -N 1 -n 1 --mem 6gb --time 1:00:00'),\n", |
|
|
523 |
"# ('Taxonomy-based feature table filter. [filter_table]', 'RESOURCE_PARAMS_COMMAND', \n", |
|
|
524 |
"# '-p qiita -N 1 -n 1 --mem 4gb --time 00:20:00'),\n", |
|
|
525 |
"# ('Summarize table [summarize]', 'RESOURCE_PARAMS_COMMAND', \n", |
|
|
526 |
"# '-p qiita -N 1 -n 1 --mem 4gb --time 00:10:00'),\n", |
|
|
527 |
"# ('Add pseudocount to table. [add_pseudocount]', 'RESOURCE_PARAMS_COMMAND', \n", |
|
|
528 |
"# '-p qiita -N 1 -n 1 --mem 3.5gb --time 00:15:00'),\n", |
|
|
529 |
"# ('Filter features from a table based on abundance and prevalence [filter_features_conditionally]', \n", |
|
|
530 |
"# 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 1gb --time 00:10:00'),\n", |
|
|
531 |
"# ('Identify core features in table [core_features]', 'RESOURCE_PARAMS_COMMAND', \n", |
|
|
532 |
"# '-p qiita -N 1 -n 1 --mem 3.5gb --time 2:00:00'),\n", |
|
|
533 |
"# ('Filter features from table [filter_features]', 'RESOURCE_PARAMS_COMMAND', \n", |
|
|
534 |
"# '-p qiita -N 1 -n 1 --mem 500mb --time 00:10:00'); " |
|
|
535 |
] |
|
|
536 |
}, |
|
|
537 |
{ |
|
|
538 |
"cell_type": "markdown", |
|
|
539 |
"metadata": {}, |
|
|
540 |
"source": [ |
|
|
541 |
"## Optimizing Qiita processing jobs.\n", |
|
|
542 |
"\n", |
|
|
543 |
"As a remider, we can use:\n", |
|
|
544 |
"- 'samples'\n", |
|
|
545 |
"- 'columns'\n", |
|
|
546 |
"- 'input_size'\n", |
|
|
547 |
"- 'extra_info': this is when the current method doesn't provide the required info or we need to update it; this info comes from `job_stats_generation.py`\n", |
|
|
548 |
"\n", |
|
|
549 |
"Extra from the list of commands we should take a close look at `Generate HTML summary`" |
|
|
550 |
] |
|
|
551 |
}, |
|
|
552 |
{ |
|
|
553 |
"cell_type": "markdown", |
|
|
554 |
"metadata": {}, |
|
|
555 |
"source": [ |
|
|
556 |
"#### Generate HTML summary" |
|
|
557 |
] |
|
|
558 |
}, |
|
|
559 |
{ |
|
|
560 |
"cell_type": "code", |
|
|
561 |
"execution_count": 5, |
|
|
562 |
"metadata": {}, |
|
|
563 |
"outputs": [ |
|
|
564 |
{ |
|
|
565 |
"data": { |
|
|
566 |
"text/html": [ |
|
|
567 |
"<div>\n", |
|
|
568 |
"<style scoped>\n", |
|
|
569 |
" .dataframe tbody tr th:only-of-type {\n", |
|
|
570 |
" vertical-align: middle;\n", |
|
|
571 |
" }\n", |
|
|
572 |
"\n", |
|
|
573 |
" .dataframe tbody tr th {\n", |
|
|
574 |
" vertical-align: top;\n", |
|
|
575 |
" }\n", |
|
|
576 |
"\n", |
|
|
577 |
" .dataframe thead tr th {\n", |
|
|
578 |
" text-align: left;\n", |
|
|
579 |
" }\n", |
|
|
580 |
"\n", |
|
|
581 |
" .dataframe thead tr:last-of-type th {\n", |
|
|
582 |
" text-align: right;\n", |
|
|
583 |
" }\n", |
|
|
584 |
"</style>\n", |
|
|
585 |
"<table border=\"1\" class=\"dataframe\">\n", |
|
|
586 |
" <thead>\n", |
|
|
587 |
" <tr>\n", |
|
|
588 |
" <th></th>\n", |
|
|
589 |
" <th></th>\n", |
|
|
590 |
" <th></th>\n", |
|
|
591 |
" <th colspan=\"3\" halign=\"left\">ElapsedRawTime</th>\n", |
|
|
592 |
" <th colspan=\"2\" halign=\"left\">MaxRSSRaw</th>\n", |
|
|
593 |
" <th colspan=\"2\" halign=\"left\">WaitTime</th>\n", |
|
|
594 |
" </tr>\n", |
|
|
595 |
" <tr>\n", |
|
|
596 |
" <th></th>\n", |
|
|
597 |
" <th></th>\n", |
|
|
598 |
" <th></th>\n", |
|
|
599 |
" <th>count</th>\n", |
|
|
600 |
" <th>min</th>\n", |
|
|
601 |
" <th>max</th>\n", |
|
|
602 |
" <th>min</th>\n", |
|
|
603 |
" <th>max</th>\n", |
|
|
604 |
" <th>min</th>\n", |
|
|
605 |
" <th>max</th>\n", |
|
|
606 |
" </tr>\n", |
|
|
607 |
" <tr>\n", |
|
|
608 |
" <th>cName</th>\n", |
|
|
609 |
" <th>sName</th>\n", |
|
|
610 |
" <th>extra_info</th>\n", |
|
|
611 |
" <th></th>\n", |
|
|
612 |
" <th></th>\n", |
|
|
613 |
" <th></th>\n", |
|
|
614 |
" <th></th>\n", |
|
|
615 |
" <th></th>\n", |
|
|
616 |
" <th></th>\n", |
|
|
617 |
" <th></th>\n", |
|
|
618 |
" </tr>\n", |
|
|
619 |
" </thead>\n", |
|
|
620 |
" <tbody>\n", |
|
|
621 |
" <tr>\n", |
|
|
622 |
" <th rowspan=\"2\" valign=\"top\">Generate HTML summary</th>\n", |
|
|
623 |
" <th>Sequencing Data Type</th>\n", |
|
|
624 |
" <th>NaN</th>\n", |
|
|
625 |
" <td>78</td>\n", |
|
|
626 |
" <td>0 days 00:00:35</td>\n", |
|
|
627 |
" <td>0 days 02:18:54</td>\n", |
|
|
628 |
" <td>56.6 MB</td>\n", |
|
|
629 |
" <td>85.7 MB</td>\n", |
|
|
630 |
" <td>0 days 00:00:00</td>\n", |
|
|
631 |
" <td>0 days 06:22:26</td>\n", |
|
|
632 |
" </tr>\n", |
|
|
633 |
" <tr>\n", |
|
|
634 |
" <th>BIOM type</th>\n", |
|
|
635 |
" <th>NaN</th>\n", |
|
|
636 |
" <td>2</td>\n", |
|
|
637 |
" <td>0 days 00:01:43</td>\n", |
|
|
638 |
" <td>0 days 00:02:23</td>\n", |
|
|
639 |
" <td>278.1 MB</td>\n", |
|
|
640 |
" <td>315.8 MB</td>\n", |
|
|
641 |
" <td>0 days 00:00:00</td>\n", |
|
|
642 |
" <td>0 days 00:00:01</td>\n", |
|
|
643 |
" </tr>\n", |
|
|
644 |
" </tbody>\n", |
|
|
645 |
"</table>\n", |
|
|
646 |
"</div>" |
|
|
647 |
], |
|
|
648 |
"text/plain": [ |
|
|
649 |
" ElapsedRawTime \\\n", |
|
|
650 |
" count \n", |
|
|
651 |
"cName sName extra_info \n", |
|
|
652 |
"Generate HTML summary Sequencing Data Type NaN 78 \n", |
|
|
653 |
" BIOM type NaN 2 \n", |
|
|
654 |
"\n", |
|
|
655 |
" \\\n", |
|
|
656 |
" min \n", |
|
|
657 |
"cName sName extra_info \n", |
|
|
658 |
"Generate HTML summary Sequencing Data Type NaN 0 days 00:00:35 \n", |
|
|
659 |
" BIOM type NaN 0 days 00:01:43 \n", |
|
|
660 |
"\n", |
|
|
661 |
" \\\n", |
|
|
662 |
" max \n", |
|
|
663 |
"cName sName extra_info \n", |
|
|
664 |
"Generate HTML summary Sequencing Data Type NaN 0 days 02:18:54 \n", |
|
|
665 |
" BIOM type NaN 0 days 00:02:23 \n", |
|
|
666 |
"\n", |
|
|
667 |
" MaxRSSRaw \\\n", |
|
|
668 |
" min max \n", |
|
|
669 |
"cName sName extra_info \n", |
|
|
670 |
"Generate HTML summary Sequencing Data Type NaN 56.6 MB 85.7 MB \n", |
|
|
671 |
" BIOM type NaN 278.1 MB 315.8 MB \n", |
|
|
672 |
"\n", |
|
|
673 |
" WaitTime \\\n", |
|
|
674 |
" min \n", |
|
|
675 |
"cName sName extra_info \n", |
|
|
676 |
"Generate HTML summary Sequencing Data Type NaN 0 days 00:00:00 \n", |
|
|
677 |
" BIOM type NaN 0 days 00:00:00 \n", |
|
|
678 |
"\n", |
|
|
679 |
" \n", |
|
|
680 |
" max \n", |
|
|
681 |
"cName sName extra_info \n", |
|
|
682 |
"Generate HTML summary Sequencing Data Type NaN 0 days 06:22:26 \n", |
|
|
683 |
" BIOM type NaN 0 days 00:00:01 " |
|
|
684 |
] |
|
|
685 |
}, |
|
|
686 |
"metadata": {}, |
|
|
687 |
"output_type": "display_data" |
|
|
688 |
} |
|
|
689 |
], |
|
|
690 |
"source": [ |
|
|
691 |
"# Generate HTML summary\n", |
|
|
692 |
"cmd = 'Generate HTML summary'\n", |
|
|
693 |
"summary = df[df.cName == cmd].groupby(\n", |
|
|
694 |
" ['cName', 'sName', 'extra_info'], dropna=False)[\n", |
|
|
695 |
" ['ElapsedRawTime', 'MaxRSSRaw', 'WaitTime']].agg(['count', 'min', 'max']).copy()\n", |
|
|
696 |
"summary[('MaxRSSRaw', 'min')] = summary[('MaxRSSRaw', 'min')].apply(naturalsize)\n", |
|
|
697 |
"summary[('MaxRSSRaw', 'max')] = summary[('MaxRSSRaw', 'max')].apply(naturalsize)\n", |
|
|
698 |
"summary.drop(columns=[('MaxRSSRaw', 'count')], inplace=True)\n", |
|
|
699 |
"summary.drop(columns=[('WaitTime', 'count')], inplace=True)\n", |
|
|
700 |
"summary.sort_values(('ElapsedRawTime', 'max'), inplace=True, ascending=False)\n", |
|
|
701 |
"\n", |
|
|
702 |
"display(summary)\n", |
|
|
703 |
"\n", |
|
|
704 |
"# As a little background: in multiple cases the `Generate HTML summary` command is run as part of the\n", |
|
|
705 |
"# Validate command\n", |
|
|
706 |
"# Note: there is no special case (like for `Validate`) for `Generate HTML summary` but the jobs are small \n", |
|
|
707 |
"# enough to be bin together\n", |
|
|
708 |
"\n", |
|
|
709 |
"# *** RESOURCE ALLOCATION ***\n", |
|
|
710 |
"\n", |
|
|
711 |
"# INSERT INTO qiita.processing_job_resource_allocation (name, job_type, allocation) \n", |
|
|
712 |
"# VALUES ('Generate HTML summary', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 500mb --time 3:00:00');" |
|
|
713 |
] |
|
|
714 |
}, |
|
|
715 |
{ |
|
|
716 |
"cell_type": "markdown", |
|
|
717 |
"metadata": {}, |
|
|
718 |
"source": [ |
|
|
719 |
"#### Rest of Qiita jobs" |
|
|
720 |
] |
|
|
721 |
} |
|
|
722 |
], |
|
|
723 |
"metadata": { |
|
|
724 |
"kernelspec": { |
|
|
725 |
"display_name": "Python 3 (ipykernel)", |
|
|
726 |
"language": "python", |
|
|
727 |
"name": "python3" |
|
|
728 |
}, |
|
|
729 |
"language_info": { |
|
|
730 |
"codemirror_mode": { |
|
|
731 |
"name": "ipython", |
|
|
732 |
"version": 3 |
|
|
733 |
}, |
|
|
734 |
"file_extension": ".py", |
|
|
735 |
"mimetype": "text/x-python", |
|
|
736 |
"name": "python", |
|
|
737 |
"nbconvert_exporter": "python", |
|
|
738 |
"pygments_lexer": "ipython3", |
|
|
739 |
"version": "3.9.7" |
|
|
740 |
} |
|
|
741 |
}, |
|
|
742 |
"nbformat": 4, |
|
|
743 |
"nbformat_minor": 4 |
|
|
744 |
} |