a b/notebooks/resource-allocation/102023.1.ipynb
1
{
2
 "cells": [
3
  {
4
   "cell_type": "code",
5
   "execution_count": 1,
6
   "metadata": {
7
    "scrolled": false
8
   },
9
   "outputs": [],
10
   "source": [
11
    "import pandas as pd\n",
12
    "import seaborn as sns\n",
13
    "from datetime import datetime, timedelta, date\n",
14
    "from humanize import naturalsize\n",
15
    "import matplotlib.pyplot as plt\n",
16
    "import numpy as np\n",
17
    "\n",
18
    "%matplotlib inline"
19
   ]
20
  },
21
  {
22
   "cell_type": "markdown",
23
   "metadata": {},
24
   "source": [
25
    "# Qiita's resource allocation - quick update from previous version\n",
26
    "\n",
27
    "After the 2023.10 release we noticed that:\n",
28
    "1. `job-output-folder` `VALIDATE` command didn't have valid request because those jobs do not have sample/column values\n",
29
    "2. The default during a resource allocation for time is minutes and the calculations were done in seconds"
30
   ]
31
  },
32
  {
33
   "cell_type": "markdown",
34
   "metadata": {},
35
   "source": [
36
    "# Loading data\n",
37
    "\n",
38
    "First you will need to run `generate-allocation-summary.py` in Qiita as the qiita user (or whatever user runs qiita in your system). The resulting file would be: `job_[date].tsv.gz`.\n",
39
    "\n",
40
    "The generated file will have these columns: `['JobID', 'ElapsedRaw', 'MaxRSS', 'Submit', 'Start', 'MaxRSS.1', 'CPUTimeRAW', 'ReqMem', 'AllocCPUS', 'AveVMSize', 'QiitaID', 'external_id', 'sId', 'sName', 'sVersion', 'cId', 'cName', 'samples', 'columns', 'input_size', 'extra_info'],`."
41
   ]
42
  },
43
  {
44
   "cell_type": "code",
45
   "execution_count": 2,
46
   "metadata": {
47
    "scrolled": true
48
   },
49
   "outputs": [],
50
   "source": [
51
    "m1g = 2**30\n",
52
    "df = pd.read_csv('jobs_2023-10-04.tsv.gz', sep='\\t', dtype={'extra_info': str})\n",
53
    "df['ElapsedRawTime'] = pd.to_timedelta(df.ElapsedRawTime)"
54
   ]
55
  },
56
  {
57
   "cell_type": "code",
58
   "execution_count": 3,
59
   "metadata": {},
60
   "outputs": [
61
    {
62
     "data": {
63
      "text/plain": [
64
       "'There are 101147 successful jobs since we moved to barnacle2 and the largest external_id is: 1581986'"
65
      ]
66
     },
67
     "execution_count": 3,
68
     "metadata": {},
69
     "output_type": "execute_result"
70
    }
71
   ],
72
   "source": [
73
    "# for reference for the next iteration of this notebook\n",
74
    "f'There are {df.shape[0]} successful jobs since we moved to barnacle2 and the largest external_id is: {df.external_id.max()}'"
75
   ]
76
  },
77
  {
78
   "cell_type": "markdown",
79
   "metadata": {},
80
   "source": [
81
    "# 1. Getting the default values for `job-output-folder` `VALIDATE`"
82
   ]
83
  },
84
  {
85
   "cell_type": "code",
86
   "execution_count": 4,
87
   "metadata": {},
88
   "outputs": [
89
    {
90
     "data": {
91
      "text/html": [
92
       "<div>\n",
93
       "<style scoped>\n",
94
       "    .dataframe tbody tr th:only-of-type {\n",
95
       "        vertical-align: middle;\n",
96
       "    }\n",
97
       "\n",
98
       "    .dataframe tbody tr th {\n",
99
       "        vertical-align: top;\n",
100
       "    }\n",
101
       "\n",
102
       "    .dataframe thead tr th {\n",
103
       "        text-align: left;\n",
104
       "    }\n",
105
       "\n",
106
       "    .dataframe thead tr:last-of-type th {\n",
107
       "        text-align: right;\n",
108
       "    }\n",
109
       "</style>\n",
110
       "<table border=\"1\" class=\"dataframe\">\n",
111
       "  <thead>\n",
112
       "    <tr>\n",
113
       "      <th></th>\n",
114
       "      <th></th>\n",
115
       "      <th colspan=\"3\" halign=\"left\">ElapsedRawTime</th>\n",
116
       "      <th colspan=\"3\" halign=\"left\">MaxRSSRaw</th>\n",
117
       "    </tr>\n",
118
       "    <tr>\n",
119
       "      <th></th>\n",
120
       "      <th></th>\n",
121
       "      <th>count</th>\n",
122
       "      <th>min</th>\n",
123
       "      <th>max</th>\n",
124
       "      <th>count</th>\n",
125
       "      <th>min</th>\n",
126
       "      <th>max</th>\n",
127
       "    </tr>\n",
128
       "    <tr>\n",
129
       "      <th>cName</th>\n",
130
       "      <th>sName</th>\n",
131
       "      <th></th>\n",
132
       "      <th></th>\n",
133
       "      <th></th>\n",
134
       "      <th></th>\n",
135
       "      <th></th>\n",
136
       "      <th></th>\n",
137
       "    </tr>\n",
138
       "  </thead>\n",
139
       "  <tbody>\n",
140
       "    <tr>\n",
141
       "      <th rowspan=\"12\" valign=\"top\">Validate</th>\n",
142
       "      <th>BIOM type - BIOM</th>\n",
143
       "      <td>687</td>\n",
144
       "      <td>0 days 00:00:55</td>\n",
145
       "      <td>0 days 01:03:49</td>\n",
146
       "      <td>687</td>\n",
147
       "      <td>222.8 MB</td>\n",
148
       "      <td>82.0 GB</td>\n",
149
       "    </tr>\n",
150
       "    <tr>\n",
151
       "      <th>Diversity types - FeatureData</th>\n",
152
       "      <td>6</td>\n",
153
       "      <td>0 days 00:01:20</td>\n",
154
       "      <td>0 days 00:02:49</td>\n",
155
       "      <td>6</td>\n",
156
       "      <td>331.4 MB</td>\n",
157
       "      <td>384.3 MB</td>\n",
158
       "    </tr>\n",
159
       "    <tr>\n",
160
       "      <th>Diversity types - alpha_vector</th>\n",
161
       "      <td>123</td>\n",
162
       "      <td>0 days 00:01:12</td>\n",
163
       "      <td>3 days 04:36:54</td>\n",
164
       "      <td>123</td>\n",
165
       "      <td>289.3 MB</td>\n",
166
       "      <td>101.5 GB</td>\n",
167
       "    </tr>\n",
168
       "    <tr>\n",
169
       "      <th>Diversity types - distance_matrix</th>\n",
170
       "      <td>117</td>\n",
171
       "      <td>0 days 00:00:37</td>\n",
172
       "      <td>0 days 00:03:55</td>\n",
173
       "      <td>117</td>\n",
174
       "      <td>122.7 MB</td>\n",
175
       "      <td>12.5 GB</td>\n",
176
       "    </tr>\n",
177
       "    <tr>\n",
178
       "      <th>Diversity types - ordination_results</th>\n",
179
       "      <td>107</td>\n",
180
       "      <td>0 days 00:00:39</td>\n",
181
       "      <td>0 days 00:03:19</td>\n",
182
       "      <td>107</td>\n",
183
       "      <td>117.2 MB</td>\n",
184
       "      <td>2.9 GB</td>\n",
185
       "    </tr>\n",
186
       "    <tr>\n",
187
       "      <th>Sequencing Data Type - Demultiplexed</th>\n",
188
       "      <td>43</td>\n",
189
       "      <td>0 days 00:00:35</td>\n",
190
       "      <td>0 days 00:12:23</td>\n",
191
       "      <td>43</td>\n",
192
       "      <td>83.4 MB</td>\n",
193
       "      <td>517.4 MB</td>\n",
194
       "    </tr>\n",
195
       "    <tr>\n",
196
       "      <th>Sequencing Data Type - FASTA</th>\n",
197
       "      <td>2</td>\n",
198
       "      <td>0 days 00:00:56</td>\n",
199
       "      <td>0 days 00:02:23</td>\n",
200
       "      <td>2</td>\n",
201
       "      <td>79.8 MB</td>\n",
202
       "      <td>83.6 MB</td>\n",
203
       "    </tr>\n",
204
       "    <tr>\n",
205
       "      <th>Sequencing Data Type - FASTQ</th>\n",
206
       "      <td>32</td>\n",
207
       "      <td>0 days 00:00:41</td>\n",
208
       "      <td>0 days 01:50:44</td>\n",
209
       "      <td>32</td>\n",
210
       "      <td>78.7 MB</td>\n",
211
       "      <td>84.4 MB</td>\n",
212
       "    </tr>\n",
213
       "    <tr>\n",
214
       "      <th>Sequencing Data Type - SFF</th>\n",
215
       "      <td>1</td>\n",
216
       "      <td>0 days 00:01:09</td>\n",
217
       "      <td>0 days 00:01:09</td>\n",
218
       "      <td>1</td>\n",
219
       "      <td>79.6 MB</td>\n",
220
       "      <td>79.6 MB</td>\n",
221
       "    </tr>\n",
222
       "    <tr>\n",
223
       "      <th>Sequencing Data Type - per_sample_FASTQ</th>\n",
224
       "      <td>73</td>\n",
225
       "      <td>0 days 00:00:36</td>\n",
226
       "      <td>0 days 18:13:21</td>\n",
227
       "      <td>73</td>\n",
228
       "      <td>77.6 MB</td>\n",
229
       "      <td>83.6 MB</td>\n",
230
       "    </tr>\n",
231
       "    <tr>\n",
232
       "      <th>Visualization types - q2_visualization</th>\n",
233
       "      <td>133</td>\n",
234
       "      <td>0 days 00:00:36</td>\n",
235
       "      <td>0 days 00:24:56</td>\n",
236
       "      <td>133</td>\n",
237
       "      <td>51.5 MB</td>\n",
238
       "      <td>67.5 MB</td>\n",
239
       "    </tr>\n",
240
       "    <tr>\n",
241
       "      <th>qtp-job-output-folder - job-output-folder</th>\n",
242
       "      <td>228</td>\n",
243
       "      <td>0 days 00:00:31</td>\n",
244
       "      <td>0 days 00:04:06</td>\n",
245
       "      <td>228</td>\n",
246
       "      <td>18.3 MB</td>\n",
247
       "      <td>46.7 MB</td>\n",
248
       "    </tr>\n",
249
       "  </tbody>\n",
250
       "</table>\n",
251
       "</div>"
252
      ],
253
      "text/plain": [
254
       "                                                   ElapsedRawTime  \\\n",
255
       "                                                            count   \n",
256
       "cName    sName                                                      \n",
257
       "Validate BIOM type - BIOM                                     687   \n",
258
       "         Diversity types - FeatureData                          6   \n",
259
       "         Diversity types - alpha_vector                       123   \n",
260
       "         Diversity types - distance_matrix                    117   \n",
261
       "         Diversity types - ordination_results                 107   \n",
262
       "         Sequencing Data Type - Demultiplexed                  43   \n",
263
       "         Sequencing Data Type - FASTA                           2   \n",
264
       "         Sequencing Data Type - FASTQ                          32   \n",
265
       "         Sequencing Data Type - SFF                             1   \n",
266
       "         Sequencing Data Type - per_sample_FASTQ               73   \n",
267
       "         Visualization types - q2_visualization               133   \n",
268
       "         qtp-job-output-folder - job-output-folder            228   \n",
269
       "\n",
270
       "                                                                    \\\n",
271
       "                                                               min   \n",
272
       "cName    sName                                                       \n",
273
       "Validate BIOM type - BIOM                          0 days 00:00:55   \n",
274
       "         Diversity types - FeatureData             0 days 00:01:20   \n",
275
       "         Diversity types - alpha_vector            0 days 00:01:12   \n",
276
       "         Diversity types - distance_matrix         0 days 00:00:37   \n",
277
       "         Diversity types - ordination_results      0 days 00:00:39   \n",
278
       "         Sequencing Data Type - Demultiplexed      0 days 00:00:35   \n",
279
       "         Sequencing Data Type - FASTA              0 days 00:00:56   \n",
280
       "         Sequencing Data Type - FASTQ              0 days 00:00:41   \n",
281
       "         Sequencing Data Type - SFF                0 days 00:01:09   \n",
282
       "         Sequencing Data Type - per_sample_FASTQ   0 days 00:00:36   \n",
283
       "         Visualization types - q2_visualization    0 days 00:00:36   \n",
284
       "         qtp-job-output-folder - job-output-folder 0 days 00:00:31   \n",
285
       "\n",
286
       "                                                                   MaxRSSRaw  \\\n",
287
       "                                                               max     count   \n",
288
       "cName    sName                                                                 \n",
289
       "Validate BIOM type - BIOM                          0 days 01:03:49       687   \n",
290
       "         Diversity types - FeatureData             0 days 00:02:49         6   \n",
291
       "         Diversity types - alpha_vector            3 days 04:36:54       123   \n",
292
       "         Diversity types - distance_matrix         0 days 00:03:55       117   \n",
293
       "         Diversity types - ordination_results      0 days 00:03:19       107   \n",
294
       "         Sequencing Data Type - Demultiplexed      0 days 00:12:23        43   \n",
295
       "         Sequencing Data Type - FASTA              0 days 00:02:23         2   \n",
296
       "         Sequencing Data Type - FASTQ              0 days 01:50:44        32   \n",
297
       "         Sequencing Data Type - SFF                0 days 00:01:09         1   \n",
298
       "         Sequencing Data Type - per_sample_FASTQ   0 days 18:13:21        73   \n",
299
       "         Visualization types - q2_visualization    0 days 00:24:56       133   \n",
300
       "         qtp-job-output-folder - job-output-folder 0 days 00:04:06       228   \n",
301
       "\n",
302
       "                                                                        \n",
303
       "                                                         min       max  \n",
304
       "cName    sName                                                          \n",
305
       "Validate BIOM type - BIOM                           222.8 MB   82.0 GB  \n",
306
       "         Diversity types - FeatureData              331.4 MB  384.3 MB  \n",
307
       "         Diversity types - alpha_vector             289.3 MB  101.5 GB  \n",
308
       "         Diversity types - distance_matrix          122.7 MB   12.5 GB  \n",
309
       "         Diversity types - ordination_results       117.2 MB    2.9 GB  \n",
310
       "         Sequencing Data Type - Demultiplexed        83.4 MB  517.4 MB  \n",
311
       "         Sequencing Data Type - FASTA                79.8 MB   83.6 MB  \n",
312
       "         Sequencing Data Type - FASTQ                78.7 MB   84.4 MB  \n",
313
       "         Sequencing Data Type - SFF                  79.6 MB   79.6 MB  \n",
314
       "         Sequencing Data Type - per_sample_FASTQ     77.6 MB   83.6 MB  \n",
315
       "         Visualization types - q2_visualization      51.5 MB   67.5 MB  \n",
316
       "         qtp-job-output-folder - job-output-folder   18.3 MB   46.7 MB  "
317
      ]
318
     },
319
     "metadata": {},
320
     "output_type": "display_data"
321
    }
322
   ],
323
   "source": [
324
    "cname = 'Validate'\n",
325
    "_df = df[(df.cName == cname)].copy()\n",
326
    "\n",
327
    "summary = _df[_df['samples'].isnull() & _df['columns'].isnull()].groupby(\n",
328
    "    ['cName', 'sName'])[['ElapsedRawTime', 'MaxRSSRaw']].agg(['count', 'min', 'max']).copy()\n",
329
    "\n",
330
    "summary[('MaxRSSRaw', 'min')] = summary[('MaxRSSRaw', 'min')].apply(naturalsize)\n",
331
    "summary[('MaxRSSRaw', 'max')] = summary[('MaxRSSRaw', 'max')].apply(naturalsize)\n",
332
    "\n",
333
    "display(summary)\n",
334
    "\n",
335
    "# New allocation: -p qiita -N 1 -n 1 --mem 100mb --time 00:40:00"
336
   ]
337
  },
338
  {
339
   "cell_type": "markdown",
340
   "metadata": {},
341
   "source": [
342
    "# 2. Updates for the seconds to minute confusion"
343
   ]
344
  },
345
  {
346
   "cell_type": "markdown",
347
   "metadata": {},
348
   "source": [
349
    "\n",
350
    "=============\n",
351
    "=============\n",
352
    "     allocation = '-p qiita -N 1 -n 1 '\n",
353
    "            || '--mem (2**30)+({samples}*150000) '\n",
354
    "            || '--time 240'\n",
355
    "\n",
356
    "UPDATE qiita.processing_job_resource_allocation set\n",
357
    "    allocation = '-p qiita -N 1 -n 1 '\n",
358
    "           || '--mem (2**30)+({samples}*150000) '\n",
359
    "           || '--time 4'\n",
360
    "    WHERE job_type = 'RESOURCE_PARAMS_COMMAND' and\n",
361
    "    name = 'delete_sample_or_column';\n",
362
    "\n",
363
    "=============\n",
364
    "=============\n",
365
    "     allocation = '-p qiita -N 1 -n 1 '\n",
366
    "            || '--mem {samples}*10000000'\n",
367
    "            || '--time 61200'\n",
368
    "\n",
369
    "UPDATE qiita.processing_job_resource_allocation set\n",
370
    "    allocation = '-p qiita -N 1 -n 1 '\n",
371
    "           || '--mem {samples}*10000000'\n",
372
    "           || '--time 1020'\n",
373
    "    WHERE job_type = 'RESOURCE_PARAMS_COMMAND' and\n",
374
    "    name = 'Sequence Processing Pipeline';\n",
375
    "\n",
376
    "=============\n",
377
    "=============\n",
378
    "     allocation = '-p qiita -N 1 -n 1 --mem 4g --time 900'\n",
379
    "\n",
380
    "UPDATE qiita.processing_job_resource_allocation set\n",
381
    "    allocation = '-p qiita -N 1 -n 1 --mem 4g --time 15'\n",
382
    "    WHERE job_type = 'RESOURCE_PARAMS_COMMAND' and\n",
383
    "    name = 'Filter samples from table [filter_samples]';\n",
384
    "\n",
385
    "=============\n",
386
    "=============\n",
387
    "     allocation = '-p qiita -N 1 -n 1 '\n",
388
    "            || '--mem (2**31)+({input_size}*6) if\n",
389
    "(2**31)+({input_size}*6) < 13958643712 else 13958643712 '\n",
390
    "            || '--time 2400'\n",
391
    "\n",
392
    "UPDATE qiita.processing_job_resource_allocation set\n",
393
    "    allocation = '-p qiita -N 1 -n 1 '\n",
394
    "           || '--mem (2**31)+({input_size}*6) if\n",
395
    "(2**31)+({input_size}*6) < 13958643712 else 13958643712 '\n",
396
    "           || '--time 40'\n",
397
    "    WHERE job_type = 'RESOURCE_PARAMS_COMMAND' and\n",
398
    "    name = 'Rarefy table [rarefy]';\n",
399
    "\n",
400
    "=============\n",
401
    "=============\n",
402
    "     allocation = '-p qiita -N 1 -n 1 '\n",
403
    "            || '--mem 14g'\n",
404
    "            || '--time 360'\n",
405
    "\n",
406
    "UPDATE qiita.processing_job_resource_allocation set\n",
407
    "    allocation = '-p qiita -N 1 -n 1 '\n",
408
    "           || '--mem 14g '\n",
409
    "           || '--time 6'\n",
410
    "    WHERE job_type = 'RESOURCE_PARAMS_COMMAND' and\n",
411
    "    name = 'Alpha diversity (phylogenetic) [alpha_phylogenetic]';\n",
412
    "\n",
413
    "\n",
414
    "=============\n",
415
    "=============\n",
416
    "     allocation = '-p qiita -N 1 -n 1 '\n",
417
    "            || '--mem (2**33)+(2**30)+(({samples}*{columns}*{input_size})/4500000)'\n",
418
    "            || '--time 1800'\n",
419
    "\n",
420
    "UPDATE qiita.processing_job_resource_allocation set\n",
421
    "    allocation = '-p qiita -N 1 -n 1 '\n",
422
    "           || '--mem\n",
423
    "(2**33)+(2**30)+(({samples}*{columns}*{input_size})/4500000) '\n",
424
    "           || '--time 30'\n",
425
    "    WHERE job_type = 'RESOURCE_PARAMS_COMMAND' and\n",
426
    "    name = 'Visualize and Interact with Principal Coordinates Analysis\n",
427
    "Plots [plot]';\n",
428
    "\n",
429
    "=============\n",
430
    "=============\n",
431
    "     allocation = '-p qiita -N 1 -n 1 '\n",
432
    "            || '--mem (2**32)+(({samples}*{columns}*{input_size}')/20000)'\n",
433
    "            || '--time 90000'\n",
434
    "\n",
435
    "UPDATE qiita.processing_job_resource_allocation set\n",
436
    "    allocation = '-p qiita -N 1 -n 1 '\n",
437
    "           || '--mem (2**32)+(({samples}*{columns}*{input_size})/20000) '\n",
438
    "           || '--time 1500'\n",
439
    "    WHERE job_type = 'RESOURCE_PARAMS_COMMAND' and\n",
440
    "    name = 'Alpha rarefaction curves [alpha_rarefaction]';\n",
441
    "\n",
442
    "=============\n",
443
    "=============\n",
444
    "     allocation = '-p qiita -N 1 -n 1 '\n",
445
    "            || '--mem 2*(2**30)+{input_size} if 2*(2**30)+{input_size} < 16*(2**30) else 16*(2**30)'\n",
446
    "            || '--time 36000'\n",
447
    "\n",
448
    "UPDATE qiita.processing_job_resource_allocation set\n",
449
    "    allocation = '-p qiita -N 1 -n 1 '\n",
450
    "           || '--mem 2*(2**30)+{input_size} if 2*(2**30)+{input_size}\n",
451
    "< 16*(2**30) else 16*(2**30) '\n",
452
    "           || '--time 600'\n",
453
    "    WHERE job_type = 'RESOURCE_PARAMS_COMMAND' and\n",
454
    "    name = 'Trimming';\n",
455
    "\n",
456
    "=============\n",
457
    "=============\n",
458
    "     allocation = '-p qiita -N 1 -n 1 '\n",
459
    "            || '--mem (2**30)+({samples}*{columns}*2000)'\n",
460
    "            || '--time 2300'\n",
461
    "\n",
462
    "UPDATE qiita.processing_job_resource_allocation set\n",
463
    "    allocation = '-p qiita -N 1 -n 1 '\n",
464
    "           || '--mem (2**30)+({samples}*{columns}*2000) '\n",
465
    "           || '--time 39'\n",
466
    "    WHERE job_type = 'RESOURCE_PARAMS_COMMAND' and\n",
467
    "    name = 'update_sample_template';"
468
   ]
469
  },
470
  {
471
   "cell_type": "code",
472
   "execution_count": null,
473
   "metadata": {},
474
   "outputs": [],
475
   "source": []
476
  }
477
 ],
478
 "metadata": {
479
  "kernelspec": {
480
   "display_name": "Python 3 (ipykernel)",
481
   "language": "python",
482
   "name": "python3"
483
  },
484
  "language_info": {
485
   "codemirror_mode": {
486
    "name": "ipython",
487
    "version": 3
488
   },
489
   "file_extension": ".py",
490
   "mimetype": "text/x-python",
491
   "name": "python",
492
   "nbconvert_exporter": "python",
493
   "pygments_lexer": "ipython3",
494
   "version": "3.9.7"
495
  }
496
 },
497
 "nbformat": 4,
498
 "nbformat_minor": 4
499
}