In [1]:
import pandas as pd
import seaborn as sns
from datetime import datetime, timedelta, date
from humanize import naturalsize
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

# Qiita's resource allocation

This notebook walks through how to load & parse the job stats from Qiita. It then splits the different commands by their resource utilization so that the resources requested for each of them are as accurate/fair as possible. Here resource allocations are mainly walltime (`ElapsedRawTime`), memory (`MaxRSSRaw`) and the time a job took to start running (`WaitTime`: Start - Submit).

# Loading data

First you will need to run `generate-allocation-summary.py` in Qiita as the qiita user (or whatever user runs qiita in your system). The resulting file will be: `jobs_[date].tsv.gz`.

The generated file will have these columns: `['JobID', 'ElapsedRaw', 'MaxRSS', 'Submit', 'Start', 'MaxRSS.1', 'CPUTimeRAW', 'ReqMem', 'AllocCPUS', 'AveVMSize', 'QiitaID', 'external_id', 'sId', 'sName', 'sVersion', 'cId', 'cName', 'samples', 'columns', 'input_size', 'extra_info']`.

In [2]:
# 1 GiB expressed in bytes (2**30)
m1g = 2**30

# `extra_info` is read explicitly as str instead of letting pandas infer its dtype
df = pd.read_csv('jobs_2023-10-31.tsv.gz', sep='\t', dtype={'extra_info': str})

# read_csv does not parse durations, so convert every duration column that is
# compared or aggregated in this notebook; otherwise min/max/sort on them would
# work on text (lexicographic order) instead of on actual time spans
df['ElapsedRawTime'] = pd.to_timedelta(df['ElapsedRawTime'])
df['WaitTime'] = pd.to_timedelta(df['WaitTime'])

In [3]:
# Reference numbers to compare against in the next iteration of this notebook:
# how many jobs were loaded and the highest external_id seen so far.
(f'There are {len(df)} successful jobs since we moved to barnacle2 '
 f'and the largest external_id is: {df["external_id"].max()}')

'There are 106548 successful jobs since we moved to barnacle2 and the largest external_id is: 1614116'

# Deciding what to optimize and what to leave with a default value

In the previous versions (072023, 102023, 102023.1) we decided to only optimize commands that use more than 4gb or 4hrs; now we want to review the commands that stay below 4gb and 4hrs so we can add specific parameters for them.

In [4]:
# Per command (cName) and software (sName): job count plus min/max of the
# walltime and of the peak memory.
summary = df.groupby(['cName', 'sName'])[
    ['ElapsedRawTime', 'MaxRSSRaw']].agg(['count', 'min', 'max'])

# This iteration reviews the small commands: keep only those whose jobs always
# stayed below 4gb of memory AND below 4 hrs of walltime.
summary = summary[(summary[('MaxRSSRaw', 'max')] < 4*m1g) &
                  (summary[('ElapsedRawTime', 'max')] < timedelta(hours=4))]

# most frequently run commands first; afterwards a single count column is enough
summary = summary.sort_values(('MaxRSSRaw', 'count'), ascending=False)
summary = summary.drop(columns=[('MaxRSSRaw', 'count')])

# keep only commands with more than 40 jobs to avoid over fitting early
summary = summary[summary[('ElapsedRawTime', 'count')] > 40]

# ignore commands that were optimized on the previous notebooks - as part of
# larger sets (currently only `Validate`)
summary = summary[summary.index.get_level_values('cName') != 'Validate']

# human-readable memory sizes; from here on these two columns hold strings
summary[('MaxRSSRaw', 'min')] = summary[('MaxRSSRaw', 'min')].apply(naturalsize)
summary[('MaxRSSRaw', 'max')] = summary[('MaxRSSRaw', 'max')].apply(naturalsize)

# show everything that is not qiime2 first, then the qiime2 commands
_df = summary[summary.index.get_level_values('sName') != 'qiime2']
print("qiita:", _df.shape[0])
display(_df)

_df = summary[summary.index.get_level_values('sName') == 'qiime2']
print("qiime2:", _df.shape[0])
display(_df)

# *** RESOURCE ALLOCATION ***

# Qiita jobs
# INSERT INTO qiita.processing_job_resource_allocation (name, job_type, allocation)
#     VALUES
#     ('delete_artifact', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 3:00:00'),
#     ('create_sample_template', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 600mb --time 00:20:00'),
#     ('delete_analysis', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 00:10:00'),
#     ('download_remote_files', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 4:00:00'),
#     ('delete_sample_template', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 00:30:00'),
#     ('delete_study', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 00:30:00'),
#     ('update_prep_template', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 00:05:00'),
#     ('copy_artifact', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 1:00:00'),
#     ('list_remote_files', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 00:05:00');

# Q2 jobs
# INSERT INTO qiita.processing_job_resource_allocation (name, job_type, allocation)
#     VALUES
#     ('adonis PERMANOVA test for beta group significance [adonis]', 'RESOURCE_PARAMS_COMMAND',
#      '-p qiita -N 1 -n 1 --mem 4gb --time 4:00:00'),
#     ('Core diversity metrics (non-phylogenetic) [core_metrics]', 'RESOURCE_PARAMS_COMMAND',
#      '-p qiita -N 1 -n 1 --mem 6gb --time 1:00:00'),
#     ('Taxonomy-based feature table filter. [filter_table]', 'RESOURCE_PARAMS_COMMAND',
#      '-p qiita -N 1 -n 1 --mem 4gb --time 00:20:00'),
#     ('Summarize table [summarize]', 'RESOURCE_PARAMS_COMMAND',
#      '-p qiita -N 1 -n 1 --mem 4gb --time 00:10:00'),
#     ('Add pseudocount to table. [add_pseudocount]', 'RESOURCE_PARAMS_COMMAND',
#      '-p qiita -N 1 -n 1 --mem 3.5gb --time 00:15:00'),
#     ('Filter features from a table based on abundance and prevalence [filter_features_conditionally]',
#      'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 1gb --time 00:10:00'),
#     ('Identify core features in table [core_features]', 'RESOURCE_PARAMS_COMMAND',
#      '-p qiita -N 1 -n 1 --mem 3.5gb --time 2:00:00'),
#     ('Filter features from table [filter_features]', 'RESOURCE_PARAMS_COMMAND',
#      '-p qiita -N 1 -n 1 --mem 500mb --time 00:10:00');

qiita: 10


Unnamed: 0_level_0,Unnamed: 1_level_0,ElapsedRawTime,ElapsedRawTime,ElapsedRawTime,MaxRSSRaw,MaxRSSRaw
Unnamed: 0_level_1,Unnamed: 1_level_1,count,min,max,min,max
cName,sName,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
delete_artifact,Qiita,1534,0 days 00:00:03,0 days 02:48:08,0 Bytes,122.2 MB
create_sample_template,Qiita,569,0 days 00:00:03,0 days 00:11:22,0 Bytes,415.8 MB
delete_analysis,Qiita,320,0 days 00:00:03,0 days 00:06:13,0 Bytes,120.8 MB
download_remote_files,Qiita,194,0 days 00:00:07,0 days 03:29:36,0 Bytes,128.9 MB
delete_sample_template,Qiita,181,0 days 00:00:04,0 days 00:19:31,0 Bytes,120.6 MB
delete_study,Qiita,136,0 days 00:00:03,0 days 00:16:09,0 Bytes,125.5 MB
update_prep_template,Qiita,126,0 days 00:00:03,0 days 00:02:25,0 Bytes,125.3 MB
copy_artifact,Qiita,101,0 days 00:00:06,0 days 00:33:16,0 Bytes,124.1 MB
Generate HTML summary,Sequencing Data Type,78,0 days 00:00:35,0 days 02:18:54,56.6 MB,85.7 MB
list_remote_files,Qiita,47,0 days 00:00:05,0 days 00:02:21,0 Bytes,121.7 MB


qiime2: 8


Unnamed: 0_level_0,Unnamed: 1_level_0,ElapsedRawTime,ElapsedRawTime,ElapsedRawTime,MaxRSSRaw,MaxRSSRaw
Unnamed: 0_level_1,Unnamed: 1_level_1,count,min,max,min,max
cName,sName,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
adonis PERMANOVA test for beta group significance [adonis],qiime2,552,0 days 00:00:57,0 days 00:39:12,147.4 MB,3.5 GB
Core diversity metrics (non-phylogenetic) [core_metrics],qiime2,100,0 days 00:02:17,0 days 00:25:31,213.5 MB,4.3 GB
Taxonomy-based feature table filter. [filter_table],qiime2,74,0 days 00:00:52,0 days 00:19:37,214.9 MB,2.6 GB
Summarize table [summarize],qiime2,64,0 days 00:00:56,0 days 00:05:54,229.8 MB,3.0 GB
Add pseudocount to table. [add_pseudocount],qiime2,55,0 days 00:01:04,0 days 00:06:14,242.5 MB,2.9 GB
Filter features from a table based on abundance and prevalence [filter_features_conditionally],qiime2,53,0 days 00:00:53,0 days 00:02:33,212.4 MB,553.3 MB
Identify core features in table [core_features],qiime2,49,0 days 00:01:03,0 days 00:59:29,212.9 MB,2.6 GB
Filter features from table [filter_features],qiime2,48,0 days 00:00:47,0 days 00:03:34,208.3 MB,398.4 MB


## Optimizing Qiita processing jobs.

As a reminder, we can use:
- 'samples'
- 'columns'
- 'input_size'
- 'extra_info': this is when the current method doesn't provide the required info or we need to update it; this info comes from `job_stats_generation.py`

Additionally, from the list of commands we should take a close look at `Generate HTML summary`.

#### Generate HTML summary

In [5]:
# Resource usage of `Generate HTML summary`, split by software and extra_info
cmd = 'Generate HTML summary'
resource_cols = ['ElapsedRawTime', 'MaxRSSRaw', 'WaitTime']

# dropna=False keeps the groups whose extra_info is missing; one job count
# (the ElapsedRawTime one) is kept and the longest running group goes first
summary = (
    df[df.cName == cmd]
    .groupby(['cName', 'sName', 'extra_info'], dropna=False)[resource_cols]
    .agg(['count', 'min', 'max'])
    .drop(columns=[('MaxRSSRaw', 'count'), ('WaitTime', 'count')])
    .sort_values(('ElapsedRawTime', 'max'), ascending=False)
)

# human-readable memory sizes
for stat in ('min', 'max'):
    summary[('MaxRSSRaw', stat)] = summary[('MaxRSSRaw', stat)].apply(naturalsize)

display(summary)

# As a little background: in multiple cases the `Generate HTML summary` command is run as part of the
#                         Validate command
# Note: there is no special case (like for `Validate`) for `Generate HTML summary` but the jobs are small
#       enough to be binned together

# *** RESOURCE ALLOCATION ***

# INSERT INTO qiita.processing_job_resource_allocation (name, job_type, allocation)
#     VALUES ('Generate HTML summary', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 500mb --time 3:00:00');

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ElapsedRawTime,ElapsedRawTime,ElapsedRawTime,MaxRSSRaw,MaxRSSRaw,WaitTime,WaitTime
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,min,max,min,max,min,max
cName,sName,extra_info,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Generate HTML summary,Sequencing Data Type,,78,0 days 00:00:35,0 days 02:18:54,56.6 MB,85.7 MB,0 days 00:00:00,0 days 06:22:26
Generate HTML summary,BIOM type,,2,0 days 00:01:43,0 days 00:02:23,278.1 MB,315.8 MB,0 days 00:00:00,0 days 00:00:01


#### Rest of Qiita jobs