# qiita_db/meta_util.py

r"""
Util functions (:mod:`qiita_db.meta_util`)
==========================================

.. currentmodule:: qiita_db.meta_util

This module provides utility functions that use the ORM objects. ORM objects
CANNOT import from this file.

Methods
-------

.. autosummary::
   :toctree: generated/

   get_lat_longs
"""
# -----------------------------------------------------------------------------
# Copyright (c) 2014--, The Qiita Development Team.
#
# Distributed under the terms of the BSD 3-clause License.
#
# The full license is in the file LICENSE, distributed with this software.
# -----------------------------------------------------------------------------
from os import stat
from shutil import move
from os.path import join, relpath, basename
from time import strftime, localtime
import matplotlib.pyplot as plt
import matplotlib as mpl
from base64 import b64encode
from urllib.parse import quote
from io import BytesIO
from datetime import datetime
from collections import defaultdict, Counter
from tarfile import open as topen, TarInfo
from hashlib import md5
from re import sub
from json import loads, dump, dumps

from qiita_db.util import create_nested_path, retrieve_resource_data
from qiita_db.util import resource_allocation_plot
from qiita_core.qiita_settings import qiita_config, r_client
from qiita_core.configuration_manager import ConfigurationManager
import qiita_db as qdb

# global constant list used in resource_allocation_page
COLUMNS = [
    "sName", "sVersion", "cID", "cName", "processing_job_id",
    "parameters", "samples", "columns", "input_size", "extra_info",
    "MaxRSSRaw", "ElapsedRaw", "Start", "node_name", "node_model"]
RAW_DATA_ARTIFACT_TYPE = {
    'SFF', 'FASTQ', 'FASTA', 'FASTA_Sanger', 'per_sample_FASTQ'}
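# note: raw data artifact types can be download-restricted even when the
# owning artifact is public (see validate_filepath_access_by_user below)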
|
|
def _get_data_fpids(constructor, object_id):
    """Small function for getting the filepath IDs associated with a data
    object

    Parameters
    ----------
    constructor : a subclass of BaseData
        E.g., RawData, PreprocessedData, or ProcessedData
    object_id : int
        The ID of the data object

    Returns
    -------
    set of int
    """
    with qdb.sql_connection.TRN:
        obj = constructor(object_id)
        return {fpid for fpid, _, _ in obj.get_filepaths()}


def validate_filepath_access_by_user(user, filepath_id):
    """Validates if the user has access to the filepath_id

    Parameters
    ----------
    user : User object
        The user we are interested in
    filepath_id : int
        The filepath id

    Returns
    -------
    bool
        Whether the user has access to the filepath_id

    Notes
    -----
    Admins have access to all files so True is always returned
    """
    TRN = qdb.sql_connection.TRN
    with TRN:
        if user.level == "admin":
            # admins have access to all files
            return True

        sql = """SELECT
                (SELECT array_agg(artifact_id)
                 FROM qiita.artifact_filepath
                 WHERE filepath_id = {0}) AS artifact,
                (SELECT array_agg(study_id)
                 FROM qiita.sample_template_filepath
                 WHERE filepath_id = {0}) AS sample_info,
                (SELECT array_agg(prep_template_id)
                 FROM qiita.prep_template_filepath
                 WHERE filepath_id = {0}) AS prep_info,
                (SELECT array_agg(analysis_id)
                 FROM qiita.analysis_filepath
                 WHERE filepath_id = {0}) AS analysis""".format(filepath_id)
        TRN.add(sql)

        arid, sid, pid, anid = TRN.execute_fetchflatten()
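        # at most one of these four arrays should be non-NULL; whichever one
        # is set tells us which kind of object the filepath is attached to,
        # and the matching access rules are applied below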
|
|
        # artifacts
        if arid:
            # [0] cause we should only have 1
            artifact = qdb.artifact.Artifact(arid[0])

            if artifact.visibility == 'public':
                # TODO: https://github.com/biocore/qiita/issues/1724
                if artifact.artifact_type in RAW_DATA_ARTIFACT_TYPE:
                    study = artifact.study
                    has_access = study.has_access(user, no_public=True)
                    if (not study.public_raw_download and not has_access):
                        return False
                return True
            else:
                study = artifact.study
                if study:
                    # let's take the visibility via the Study
                    return artifact.study.has_access(user)
                else:
                    analysis = artifact.analysis
                    return analysis in (
                        user.private_analyses | user.shared_analyses)
        # sample info files
        elif sid:
            # the visibility of the sample info file is given by the
            # study visibility
            # [0] cause we should only have 1
            return qdb.study.Study(sid[0]).has_access(user)
        # prep info files
        elif pid:
            # the prep access is given by its artifacts; if the user has
            # access to any artifact, they should have access to the prep
            # [0] cause we should only have 1
            pt = qdb.metadata_template.prep_template.PrepTemplate(
                pid[0])
            a = pt.artifact
            # however, the prep info file might not have any artifacts
            # attached; in that case we will use the study access level
            if a is None:
                return qdb.study.Study(pt.study_id).has_access(user)
            else:
                if (a.visibility == 'public' or a.study.has_access(user)):
                    return True
                else:
                    for c in a.descendants.nodes():
                        if (c.visibility == 'public' or
                                c.study.has_access(user)):
                            return True
                    return False
        # analyses
        elif anid:
            # [0] cause we should only have 1
            aid = anid[0]
            analysis = qdb.analysis.Analysis(aid)
            return analysis.is_public | (analysis in (
                user.private_analyses | user.shared_analyses))
        return False


def update_redis_stats():
    """Generate the system stats and save them in redis

    Returns
    -------
    list of str
        artifact filepaths that are not present in the file system
    """
    STUDY = qdb.study.Study

    number_studies = {'public': 0, 'private': 0, 'sandbox': 0}
    number_of_samples = {'public': 0, 'private': 0, 'sandbox': 0}
    num_studies_ebi = 0
    num_samples_ebi = 0
    number_samples_ebi_prep = 0
    stats = []
    missing_files = []
    per_data_type_stats = Counter()
    for study in STUDY.iter():
        st = study.sample_template
        if st is None:
            continue

        # counting samples submitted to EBI-ENA
        len_samples_ebi = sum([esa is not None
                               for esa in st.ebi_sample_accessions.values()])
        if len_samples_ebi != 0:
            num_studies_ebi += 1
            num_samples_ebi += len_samples_ebi

        samples_status = defaultdict(set)
        for pt in study.prep_templates():
            pt_samples = list(pt.keys())
            pt_status = pt.status
            if pt_status == 'public':
                per_data_type_stats[pt.data_type()] += len(pt_samples)
            samples_status[pt_status].update(pt_samples)
            # counting experiments (samples in preps) submitted to EBI-ENA
            number_samples_ebi_prep += sum([
                esa is not None
                for esa in pt.ebi_experiment_accessions.values()])

        # counting studies
        if 'public' in samples_status:
            number_studies['public'] += 1
        elif 'private' in samples_status:
            number_studies['private'] += 1
        else:
            # note that this is a catch-all for other statuses; at the time
            # of writing the other status is: awaiting_approval
            number_studies['sandbox'] += 1

        # counting samples; note that some of these lines could be merged with
        # the block above but I decided to split it in 2 for clarity
        if 'public' in samples_status:
            number_of_samples['public'] += len(samples_status['public'])
        if 'private' in samples_status:
            number_of_samples['private'] += len(samples_status['private'])
        if 'sandbox' in samples_status:
            number_of_samples['sandbox'] += len(samples_status['sandbox'])

        # processing filepaths
        for artifact in study.artifacts():
            for adata in artifact.filepaths:
                try:
                    s = stat(adata['fp'])
                except OSError:
                    missing_files.append(adata['fp'])
                else:
                    stats.append(
                        (adata['fp_type'], s.st_size,
                         strftime('%Y-%m', localtime(s.st_mtime))))

    num_users = qdb.util.get_count('qiita.qiita_user')
    num_processing_jobs = qdb.util.get_count('qiita.processing_job')

    lat_longs = dumps(get_lat_longs())

    summary = {}
    all_dates = []
    # these are some filetypes that are too small to plot alone so we'll
    # merge them into 'other'
    group_other = {'html_summary', 'tgz', 'directory', 'raw_fasta', 'log',
                   'raw_sff', 'raw_qual', 'qza', 'html_summary_dir',
                   'plain_text', 'raw_barcodes'}
    for ft, size, ym in stats:
        if ft in group_other:
            ft = 'other'
        if ft not in summary:
            summary[ft] = {}
        if ym not in summary[ft]:
            summary[ft][ym] = 0
            all_dates.append(ym)
        summary[ft][ym] += size
    all_dates = sorted(set(all_dates))

    # sorting summaries
    ordered_summary = {}
    for dt in summary:
        new_list = []
        current_value = 0
        for ad in all_dates:
            if ad in summary[dt]:
                current_value += summary[dt][ad]
            new_list.append(current_value)
        ordered_summary[dt] = new_list

    plot_order = sorted([(k, ordered_summary[k][-1]) for k in ordered_summary],
                        key=lambda x: x[1])
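    # i.e., data types ordered by their final cumulative size, smallest first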
|
|
    # helper function to generate y axis, modified from:
    # http://stackoverflow.com/a/1094933
    def sizeof_fmt(value, position):
        number = None
        for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
            if abs(value) < 1024.0:
                number = "%3.1f%s" % (value, unit)
                break
            value /= 1024.0
        if number is None:
            number = "%.1f%s" % (value, 'Yi')
        return number
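    # e.g., sizeof_fmt(3_400_000, None) returns '3.2M'; `position` is unused
    # here but is required by the FuncFormatter callback signature used below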
|
|
    all_dates_axis = range(len(all_dates))
    plt.locator_params(axis='y', nbins=10)
    plt.figure(figsize=(20, 10))
    for k, v in plot_order:
        plt.plot(all_dates_axis, ordered_summary[k], linewidth=2, label=k)

    plt.xticks(all_dates_axis, all_dates)
    plt.legend()
    plt.grid()
    ax = plt.gca()
    ax.yaxis.set_major_formatter(mpl.ticker.FuncFormatter(sizeof_fmt))
    plt.xticks(rotation=90)
    plt.xlabel('Date')
    plt.ylabel('Storage space per data type')

    plot = BytesIO()
    plt.savefig(plot, format='png')
    plot.seek(0)
    img = 'data:image/png;base64,' + quote(b64encode(plot.getbuffer()))

    time = datetime.now().strftime('%m-%d-%y %H:%M:%S')

    portal = qiita_config.portal
    # making sure per_data_type_stats has some data so hmset doesn't fail
    if per_data_type_stats == {}:
        per_data_type_stats['No data'] = 0

    vals = [
        ('number_studies', number_studies, r_client.hmset),
        ('number_of_samples', number_of_samples, r_client.hmset),
        ('per_data_type_stats', dict(per_data_type_stats), r_client.hmset),
        ('num_users', num_users, r_client.set),
        ('lat_longs', lat_longs, r_client.set),
        ('num_studies_ebi', num_studies_ebi, r_client.set),
        ('num_samples_ebi', num_samples_ebi, r_client.set),
        ('number_samples_ebi_prep', number_samples_ebi_prep, r_client.set),
        ('img', img, r_client.set),
        ('time', time, r_client.set),
        ('num_processing_jobs', num_processing_jobs, r_client.set)]
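    # dict values go in via hmset and scalars via set; every entry lives
    # under a portal-namespaced key such as '<portal>:stats:num_users'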
|
|
    for k, v, f in vals:
        redis_key = '%s:stats:%s' % (portal, k)
        # important to "flush" variables to avoid errors
        r_client.delete(redis_key)
        f(redis_key, v)

    # preparing vals to insert into DB
    vals = dumps(dict([x[:-1] for x in vals]))
    sql = """INSERT INTO qiita.stats_daily (stats, stats_timestamp)
             VALUES (%s, NOW())"""
    qdb.sql_connection.perform_as_transaction(sql, [vals])

    return missing_files


def get_lat_longs():
    """Retrieve the latitude and longitude of all the public samples in the DB

    Returns
    -------
    list of [float, float]
        The latitude and longitude for each sample in the database
    """
    with qdb.sql_connection.TRN:
        # getting all the public studies
        studies = qdb.study.Study.get_by_status('public')

        results = []
        if studies:
            # we are going to create multiple union selects to retrieve the
            # latitude and longitude of all available studies. Note that
            # UNION in PostgreSQL automatically removes duplicates
            sql_query = """
                SELECT {0}, CAST(sample_values->>'latitude' AS FLOAT),
                       CAST(sample_values->>'longitude' AS FLOAT)
                FROM qiita.sample_{0}
                WHERE sample_values->>'latitude' != 'NaN' AND
                      sample_values->>'longitude' != 'NaN' AND
                      isnumeric(sample_values->>'latitude') AND
                      isnumeric(sample_values->>'longitude')"""
            sql = [sql_query.format(s.id) for s in studies]
            sql = ' UNION '.join(sql)
            qdb.sql_connection.TRN.add(sql)
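            # the assembled statement looks like, e.g. for studies 1 and 2:
            #   SELECT 1, ... FROM qiita.sample_1 WHERE ... UNION
            #   SELECT 2, ... FROM qiita.sample_2 WHERE ...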
|
|
            # note that duplicates have already been removed by the UNION
            results = qdb.sql_connection.TRN.execute_fetchindex()

    return results


def generate_biom_and_metadata_release(study_status='public'):
    """Generate a list of biom/metadata filepaths and a tgz of those files

    Parameters
    ----------
    study_status : str, optional
        The study status to search for. Note that this should always be set
        to 'public' but having this exposed helps with testing. The other
        options are 'private' and 'sandbox'
    """
    studies = qdb.study.Study.get_by_status(study_status)
    qiita_config = ConfigurationManager()
    working_dir = qiita_config.working_dir
    portal = qiita_config.portal
    bdir = qdb.util.get_db_files_base_dir()
    time = datetime.now().strftime('%m-%d-%y %H:%M:%S')

    data = []
    for s in studies:
        # [0] latest is first, [1] only getting the filepath
        sample_fp = relpath(s.sample_template.get_filepaths()[0][1], bdir)

        for a in s.artifacts(artifact_type='BIOM'):
            if a.processing_parameters is None or a.visibility != study_status:
                continue

            merging_schemes, parent_softwares = a.merging_scheme
            software = a.processing_parameters.command.software
            software = '%s v%s' % (software.name, software.version)

            for x in a.filepaths:
                if x['fp_type'] != 'biom' or 'only-16s' in x['fp']:
                    continue
                fp = relpath(x['fp'], bdir)
                for pt in a.prep_templates:
                    categories = pt.categories
                    platform = ''
                    target_gene = ''
                    if 'platform' in categories:
                        platform = ', '.join(
                            set(pt.get_category('platform').values()))
                    if 'target_gene' in categories:
                        target_gene = ', '.join(
                            set(pt.get_category('target_gene').values()))
                    for _, prep_fp in pt.get_filepaths():
                        if 'qiime' not in prep_fp:
                            break
                    prep_fp = relpath(prep_fp, bdir)
                    # format: (biom_fp, sample_fp, prep_fp, qiita_artifact_id,
                    #          platform, target gene, merging schemes,
                    #          artifact software/version,
                    #          parent software/version)
                    data.append((fp, sample_fp, prep_fp, a.id, platform,
                                 target_gene, merging_schemes, software,
                                 parent_softwares))

    # writing text and tgz file
    ts = datetime.now().strftime('%m%d%y-%H%M%S')
    tgz_dir = join(working_dir, 'releases')
    create_nested_path(tgz_dir)
    tgz_name = join(tgz_dir, '%s-%s-building.tgz' % (portal, study_status))
    tgz_name_final = join(tgz_dir, '%s-%s.tgz' % (portal, study_status))
    txt_lines = [
        "biom fp\tsample fp\tprep fp\tqiita artifact id\tplatform\t"
        "target gene\tmerging scheme\tartifact software\tparent software"]
    with topen(tgz_name, "w|gz") as tgz:
        for biom_fp, sample_fp, prep_fp, aid, pform, tg, ms, asv, psv in data:
            txt_lines.append("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (
                biom_fp, sample_fp, prep_fp, aid, pform, tg, ms, asv, psv))
            tgz.add(join(bdir, biom_fp), arcname=biom_fp, recursive=False)
            tgz.add(join(bdir, sample_fp), arcname=sample_fp, recursive=False)
            tgz.add(join(bdir, prep_fp), arcname=prep_fp, recursive=False)
        info = TarInfo(name='%s-%s-%s.txt' % (portal, study_status, ts))
        txt_hd = BytesIO()
        txt_hd.write(bytes('\n'.join(txt_lines), 'ascii'))
        txt_hd.seek(0)
        info.size = len(txt_hd.read())
        txt_hd.seek(0)
        tgz.addfile(tarinfo=info, fileobj=txt_hd)

    with open(tgz_name, "rb") as f:
        md5sum = md5()
        for c in iter(lambda: f.read(4096), b""):
            md5sum.update(c)
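    # the tgz is hashed in 4 KiB chunks so that large releases never have to
    # be read into memory at once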
|
|
    move(tgz_name, tgz_name_final)

    vals = [
        ('filepath', tgz_name_final[len(working_dir):], r_client.set),
        ('md5sum', md5sum.hexdigest(), r_client.set),
        ('time', time, r_client.set)]
    for k, v, f in vals:
        redis_key = '%s:release:%s:%s' % (portal, study_status, k)
        # important to "flush" variables to avoid errors
        r_client.delete(redis_key)
        f(redis_key, v)


def generate_plugin_releases():
    """Generate releases for plugins
    """
    ARCHIVE = qdb.archive.Archive
    qiita_config = ConfigurationManager()
    working_dir = qiita_config.working_dir

    commands = [c for s in qdb.software.Software.iter(active=True)
                for c in s.commands if c.post_processing_cmd is not None]

    tnow = datetime.now()
    ts = tnow.strftime('%m%d%y-%H%M%S')
    tgz_dir = join(working_dir, 'releases', 'archive')
    create_nested_path(tgz_dir)
    tgz_dir_release = join(tgz_dir, ts)
    create_nested_path(tgz_dir_release)
    for cmd in commands:
        cmd_name = cmd.name
        mschemes = [v for _, v in ARCHIVE.merging_schemes().items()
                    if cmd_name in v]
        for ms in mschemes:
            ms_name = sub('[^0-9a-zA-Z]+', '', ms)
            ms_fp = join(tgz_dir_release, ms_name)
            create_nested_path(ms_fp)

            pfp = join(ms_fp, 'archive.json')
            archives = {k: loads(v)
                        for k, v in ARCHIVE.retrieve_feature_values(
                            archive_merging_scheme=ms).items()
                        if v != ''}
            with open(pfp, 'w') as f:
                dump(archives, f)

            # now let's run the post_processing_cmd
            ppc = cmd.post_processing_cmd

            # concatenate any other parameters into a string
            params = ' '.join(["%s=%s" % (k, v) for k, v in
                               ppc['script_params'].items()])
            # append archives file and output dir parameters
            params = ("%s --fp_archive=%s --output_dir=%s" % (
                params, pfp, ms_fp))
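            # the assembled command line has the shape:
            #   <script_env> <script_path> <key=value ...>
            #       --fp_archive=<archive.json> --output_dir=<scheme dir>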
|
|
            ppc_cmd = "%s %s %s" % (
                ppc['script_env'], ppc['script_path'], params)
            p_out, p_err, rv = qdb.processing_job._system_call(ppc_cmd)
            p_out = p_out.rstrip()
            if rv != 0:
                raise ValueError('Error %d: %s' % (rv, p_out))
            p_out = loads(p_out)

    # tgz-ing all files
    tgz_name = join(tgz_dir, 'archive-%s-building.tgz' % ts)
    tgz_name_final = join(tgz_dir, 'archive.tgz')
    with topen(tgz_name, "w|gz") as tgz:
        tgz.add(tgz_dir_release, arcname=basename(tgz_dir_release))
    # getting the release md5
    with open(tgz_name, "rb") as f:
        md5sum = md5()
        for c in iter(lambda: f.read(4096), b""):
            md5sum.update(c)
    move(tgz_name, tgz_name_final)
    vals = [
        ('filepath', tgz_name_final[len(working_dir):], r_client.set),
        ('md5sum', md5sum.hexdigest(), r_client.set),
        ('time', tnow.strftime('%m-%d-%y %H:%M:%S'), r_client.set)]
    for k, v, f in vals:
        redis_key = 'release-archive:%s' % k
        # important to "flush" variables to avoid errors
        r_client.delete(redis_key)
        f(redis_key, v)


def get_software_commands(active):
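    """Build a {software name: {version: [command names]}} mapping

    Parameters
    ----------
    active : bool
        Whether to restrict the listing to active software

    Returns
    -------
    dict of {str: dict of {str: list of str}}
    """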
|
|
    software_list = [s for s in qdb.software.Software.iter(active=active)]
    software_commands = defaultdict(lambda: defaultdict(list))

    for software in software_list:
        sname = software.name
        sversion = software.version
        commands = software.commands

        for command in commands:
            software_commands[sname][sversion].append(command.name)
        software_commands[sname] = dict(software_commands[sname])

    return dict(software_commands)


def update_resource_allocation_redis(active=True):
    """Updates redis with plots and information about current software.

    Parameters
    ----------
    active : bool, optional
        Defaults to True. Should only be False when testing.
    """
    time = datetime.now().strftime('%m-%d-%y')
    scommands = get_software_commands(active)
    redis_key = 'resources:commands'
    r_client.set(redis_key, str(scommands))

    for sname, versions in scommands.items():
        for version, commands in versions.items():
            for cname in commands:
                col_name = "samples * columns"
                df = retrieve_resource_data(cname, sname, version, COLUMNS)
                if len(df) == 0:
                    continue

                fig, axs = resource_allocation_plot(df, col_name)
                titles = [0, 0]
                images = [0, 0]

                # splitting the single two-panel figure into two separate
                # images for a better layout
                for i, ax in enumerate(axs):
                    titles[i] = ax.get_title()
                    ax.set_title("")
                    # new_fig, new_ax -- a copy with either only the memory
                    # plot or only the time plot
                    new_fig = plt.figure()
                    new_ax = new_fig.add_subplot(111)
                    line = ax.lines[0]
                    new_ax.plot(line.get_xdata(), line.get_ydata(),
                                linewidth=1, color='orange')
                    handles, labels = ax.get_legend_handles_labels()
                    for handle, label, scatter_data in zip(handles,
                                                           labels,
                                                           ax.collections):
                        color = handle.get_facecolor()
                        new_ax.scatter(scatter_data.get_offsets()[:, 0],
                                       scatter_data.get_offsets()[:, 1],
                                       s=scatter_data.get_sizes(),
                                       label=label, color=color)

                    new_ax.set_xscale('log')
                    new_ax.set_yscale('log')
                    new_ax.set_xlabel(ax.get_xlabel())
                    new_ax.set_ylabel(ax.get_ylabel())
                    new_ax.legend(loc='upper left')

                    new_fig.tight_layout()
                    plot = BytesIO()
                    new_fig.savefig(plot, format='png')
                    plot.seek(0)
                    img = 'data:image/png;base64,' + quote(
                        b64encode(plot.getvalue()).decode('ascii'))
                    images[i] = img
                    plt.close(new_fig)
                plt.close(fig)

                # SID, CID, col_name
                values = [
                    ("img_mem", images[0], r_client.set),
                    ("img_time", images[1], r_client.set),
                    ('time', time, r_client.set),
                    ("title_mem", titles[0], r_client.set),
                    ("title_time", titles[1], r_client.set)
                ]
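                # each field is stored under a key of the shape:
                # resources$#<command>$#<software>$#<version>$#<col_name>:<field>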
|
|
                for k, v, f in values:
                    redis_key = 'resources$#%s$#%s$#%s$#%s:%s' % (
                        cname, sname, version, col_name, k)
                    r_client.delete(redis_key)
                    f(redis_key, v)