# -----------------------------------------------------------------------------
# Copyright (c) 2014--, The Qiita Development Team.
#
# Distributed under the terms of the BSD 3-clause License.
#
# The full license is in the file LICENSE, distributed with this software.
# -----------------------------------------------------------------------------
from six import StringIO

import pandas as pd
import numpy as np
import warnings
from iteration_utilities import duplicates

import qiita_db as qdb

from string import ascii_letters, digits


def prefix_sample_names_with_id(md_template, study_id):
    r"""Prefix the sample_names in md_template with the study id

    Parameters
    ----------
    md_template : DataFrame
        The metadata template to modify
    study_id : int
        The study to which the metadata belongs
    """
    # loop over the samples and prefix those that aren't prefixed
    sid = str(study_id)
    md_template['qiita_sample_name_with_id'] = pd.Series(
        [idx if idx.split('.', 1)[0] == sid and idx != sid
         else '%d.%s' % (study_id, idx)
         for idx in md_template.index], index=md_template.index)

    # get the rows that are going to change
    changes = len(md_template.index[
        md_template['qiita_sample_name_with_id'] != md_template.index])
    if changes != 0 and changes != len(md_template.index):
        warnings.warn(
            "Some of the samples were already prefixed with the study id.",
            qdb.exceptions.QiitaDBWarning)

    md_template.index = md_template.qiita_sample_name_with_id
    del md_template['qiita_sample_name_with_id']

    # The original metadata template had the index column unnamed -> remove
    # the name of the index for consistency
    md_template.index.name = None
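
# Illustrative usage of prefix_sample_names_with_id (a sketch kept as a
# comment so nothing runs at import time; the DataFrame and study id below
# are hypothetical):
#
#   md = pd.DataFrame({'ph': ['7.0', '6.8']}, index=['s1', '1.s2'])
#   prefix_sample_names_with_id(md, 1)
#   list(md.index)  # ['1.s1', '1.s2']; a QiitaDBWarning is emitted because
#                   # only some of the samples were already prefixed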


def load_template_to_dataframe(fn, index='sample_name'):
    """Load a sample/prep template or a QIIME mapping file into a data frame

    Parameters
    ----------
    fn : str or file-like object
        filename of the template to load, or an already open template file
    index : str, optional
        Defaults to 'sample_name'. The index to use in the loaded information

    Returns
    -------
    DataFrame
        Pandas dataframe with the loaded information

    Raises
    ------
    ValueError
        Empty file passed
    QiitaDBColumnError
        If the sample_name column is not present in the template.
    QiitaDBWarning
        When columns are dropped because they have no content for any sample.
    QiitaDBError
        When non UTF-8 characters are found in the file.
    QiitaDBDuplicateHeaderError
        If duplicate columns are present in the template

    Notes
    -----
    The index attribute of the DataFrame will be forced to be 'sample_name'
    and will be cast to a string. Additionally, rows that start with a '\t'
    character will be ignored and columns that are empty will be removed.
    Empty sample names will be removed from the DataFrame.

    Column names are case-insensitive but will be lowercased on addition to
    the database.

    Everything in the DataFrame will be read and managed as string.

    While reading the file via pandas, it's possible that it will raise a
    'tokenizing' pd.errors.ParserError, which is confusing for users; thus,
    the error is rewritten with an explanation of what it means and how to
    fix it.
    """
    # Load in file lines
    holdfile = None
    with qdb.util.open_file(fn, newline=None,
                            encoding="utf8", errors='ignore') as f:
        holdfile = f.readlines()
    if not holdfile:
        raise ValueError('Empty file passed!')

    if index == "#SampleID":
        # We're going to parse a QIIME mapping file. We are going to first
        # parse it with the QIIME function so we can remove the comments
        # easily and make sure that QIIME will accept this as a mapping file
        data, headers, comments = _parse_mapping_file(holdfile)
        holdfile = ["%s\n" % '\t'.join(d) for d in data]
        holdfile.insert(0, "%s\n" % '\t'.join(headers))

        # The QIIME parser fixes the index and removes the #
        index = 'SampleID'

    # Strip all values in the cells in the input file
    for pos, line in enumerate(holdfile):
        cols = line.split('\t')
        if pos == 0 and index != 'SampleID':
            # get and clean the controlled columns
            ccols = {'sample_name'}
            ccols.update(qdb.metadata_template.constants.CONTROLLED_COLS)
            newcols = [
                c.lower().strip() if c.lower().strip() in ccols
                else c.strip()
                for c in cols]

            # while we are here, let's check for duplicate column headers
            ncols = set(newcols)
            if len(ncols) != len(newcols):
                if '' in ncols:
                    raise ValueError(
                        'Your file has empty column headers.')
                raise qdb.exceptions.QiitaDBDuplicateHeaderError(
                    set(duplicates(newcols)))
        else:
            # .strip will remove odd chars, newlines, tabs and multiple
            # spaces but we need to add a new line at the end of the
            # line (+'\n')
            newcols = [d.strip(" \r\n") for d in cols]

        holdfile[pos] = '\t'.join(newcols) + '\n'

    # index_col:
    #   is set as False, otherwise it is cast as a float and we want a string
    # keep_default_na:
    #   is set as False, to avoid inferring empty/NA values with the defaults
    #   that Pandas has.
    # comment:
    #   using the tab character as "comment" we remove rows that are
    #   constituted only by delimiters, i.e. empty rows.
    try:
        template = pd.read_csv(
            StringIO(''.join(holdfile)),
            sep='\t',
            dtype=str,
            encoding='utf-8',
            keep_default_na=False,
            index_col=False,
            comment='\t',
            converters={index: lambda x: str(x).strip()})
    except pd.errors.ParserError as e:
        if 'tokenizing' in str(e):
            msg = ('Your file has more columns with values than headers. To '
                   'fix, make sure to delete any extra rows or columns; they '
                   'might look empty because they have spaces. Then upload '
                   'and try again.')
            raise RuntimeError(msg)
        else:
            raise e

    # remove newlines and tabs from fields
    template.replace(to_replace='[\t\n\r\x0b\x0c]+', value='',
                     regex=True, inplace=True)

    # removing columns with empty values
    template.dropna(axis='columns', how='all', inplace=True)

    if template.empty:
        raise ValueError("The template is empty")

    initial_columns = set(template.columns)

    if index not in template.columns:
        raise qdb.exceptions.QiitaDBColumnError(
            "The '%s' column is missing from your template, this file cannot "
            "be parsed." % index)

    # remove rows that have no sample identifier but that may have other data
    # in the rest of the columns
    template.dropna(subset=[index], how='all', inplace=True)

    # set the sample name as the index
    template.set_index(index, inplace=True)

    # it is not uncommon to find templates that have empty columns so let's
    # find the columns that are all ''
    columns = np.where(np.all(template.applymap(lambda x: x == ''), axis=0))
    template.drop(template.columns[columns], axis=1, inplace=True)

    initial_columns.remove(index)
    dropped_cols = initial_columns - set(template.columns)
    if dropped_cols:
        warnings.warn(
            'The following column(s) were removed from the template because '
            'all their values are empty: %s'
            % ', '.join(dropped_cols), qdb.exceptions.QiitaDBWarning)

    # removing 'sample-id' and 'sample_id' as per issue #2906
    sdrop = []
    if 'sample-id' in template.columns:
        sdrop.append('sample-id')
    if 'sample_id' in template.columns:
        sdrop.append('sample_id')
    if sdrop:
        template.drop(columns=sdrop, inplace=True)
        warnings.warn(
            'The following column(s) were removed from the template because '
            'they will cause conflicts with sample_name: %s'
            % ', '.join(sdrop), qdb.exceptions.QiitaDBWarning)

    # Pandas represents data with np.nan rather than Nones, change it to None
    # because psycopg2 knows that a None is a Null in SQL, while it doesn't
    # know what to do with NaN
    template = template.where((pd.notnull(template)), None)

    return template
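
# Illustrative usage of load_template_to_dataframe (a sketch kept as a
# comment so nothing runs at import time; the inline template below is
# hypothetical):
#
#   tsv = "sample_name\tph\n1.s1\t7.0\n1.s2\t6.8\n"
#   df = load_template_to_dataframe(StringIO(tsv))
#   df.loc['1.s1', 'ph']  # '7.0' -- every value is handled as a string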


def get_invalid_sample_names(sample_names):
    """Get a list of sample names that are not QIIME compliant

    Parameters
    ----------
    sample_names : iterable
        Iterable containing the sample names to check.

    Returns
    -------
    list
        List of str objects where each object is an invalid sample name.

    References
    ----------
    .. [1] QIIME File Types documentation:
       http://qiime.org/documentation/file_formats.html#mapping-file-overview.
    """
    # from the QIIME mapping file documentation
    valid = set(ascii_letters + digits + '.')
    inv = []

    for s in sample_names:
        if set(s) - valid:
            inv.append(s)

    return inv
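
# Illustrative usage of get_invalid_sample_names (a sketch; the sample names
# below are hypothetical):
#
#   get_invalid_sample_names(['sample.1', 'sample2', 'sample_3', 'sample 4'])
#   # ['sample_3', 'sample 4'] -- underscores and spaces are not in the
#   # QIIME-allowed alphabet (ASCII letters, digits and '.')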


def looks_like_qiime_mapping_file(fp):
    """Checks if the file looks like a QIIME mapping file

    Parameters
    ----------
    fp : str or file-like object
        filepath to check if it looks like a QIIME mapping file

    Returns
    -------
    bool
        True if fp looks like a QIIME mapping file, False otherwise.

    Notes
    -----
    This does not validate the QIIME mapping file. It simply checks the
    first line in the file and returns True if the line starts with
    '#SampleID', since a sample/prep template will start with 'sample_name'
    or some other column instead.
    """
    first_line = None
    with qdb.util.open_file(fp, newline=None, errors='replace') as f:
        first_line = f.readline()
    if not first_line:
        return False

    first_col = first_line.split()[0]
    return first_col == '#SampleID'
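
# Illustrative usage of looks_like_qiime_mapping_file (a sketch; the file
# path below is hypothetical):
#
#   looks_like_qiime_mapping_file('mapping_file.txt')
#   # True only when the first whitespace-delimited token of the first line
#   # is '#SampleID'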


def _parse_mapping_file(lines, strip_quotes=True, suppress_stripping=False):
    """Parser for map file that relates samples to metadata.

    Format: header line with fields
            optionally other comment lines starting with #
            tab-delimited fields

    Parameters
    ----------
    lines : iterable of str
        The contents of the QIIME mapping file
    strip_quotes : bool, optional
        Defaults to true. If true, quotes are removed from the data
    suppress_stripping : bool, optional
        Defaults to false. If true, spaces are not stripped

    Returns
    -------
    list of lists, list of str, list of str
        The data in the mapping file, the headers and the comments

    Raises
    ------
    QiitaDBError
        If there is any error parsing the mapping file

    Notes
    -----
    This code has been ported from QIIME.
    """
    if strip_quotes:
        if suppress_stripping:
            # remove quotes but not spaces
            def strip_f(x):
                return x.replace('"', '')
        else:
            # remove quotes and spaces
            def strip_f(x):
                return x.replace('"', '').strip()
    else:
        if suppress_stripping:
            # don't remove quotes or spaces
            def strip_f(x):
                return x
        else:
            # remove spaces but not quotes
            def strip_f(x):
                return x.strip()

    # Create lists to store the results
    mapping_data = []
    header = []
    comments = []

    # Begin iterating over lines
    for line in lines:
        line = strip_f(line)
        if not line or (suppress_stripping and not line.strip()):
            # skip blank lines when not stripping lines
            continue

        if line.startswith('#'):
            line = line[1:]
            if not header:
                header = line.strip().split('\t')
            else:
                comments.append(line)
        else:
            # Will add empty string to empty fields
            tmp_line = list(map(strip_f, line.split('\t')))
            if len(tmp_line) < len(header):
                tmp_line.extend([''] * (len(header) - len(tmp_line)))
            mapping_data.append(tmp_line)

    if not header:
        raise qdb.exceptions.QiitaDBError(
            "No header line was found in mapping file.")
    if not mapping_data:
        raise qdb.exceptions.QiitaDBError(
            "No data found in mapping file.")

    return mapping_data, header, comments
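
# Illustrative usage of _parse_mapping_file (a sketch; the mapping file
# lines below are hypothetical):
#
#   lines = ['#SampleID\tBarcodeSequence\n',
#            '#run 1 of the study\n',
#            's1\tACGT\n']
#   data, headers, comments = _parse_mapping_file(lines)
#   # data == [['s1', 'ACGT']]
#   # headers == ['SampleID', 'BarcodeSequence']
#   # comments == ['run 1 of the study']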


def get_pgsql_reserved_words():
    """Returns the set of words that are currently reserved in pgsql

    Returns
    -------
    set of str
        The reserved words
    """
    with qdb.sql_connection.TRN:
        sql = "SELECT word FROM pg_get_keywords() WHERE catcode = 'R';"
        qdb.sql_connection.TRN.add(sql)
        return set(qdb.sql_connection.TRN.execute_fetchflatten())


def get_qiime2_reserved_words():
    """Returns the set of words that are reserved in qiime2

    Returns
    -------
    set of str
        The reserved words
    """
    qiime2_reserved_column_names = ['feature id', 'feature-id', 'featureid',
                                    'id', 'sample id', 'sample-id', 'sampleid']
    return set(qiime2_reserved_column_names)
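
# Illustrative usage of the reserved-word helpers (a sketch; note that
# get_pgsql_reserved_words needs a configured Qiita database connection,
# while get_qiime2_reserved_words is purely in-memory):
#
#   'sample-id' in get_qiime2_reserved_words()   # True
#   'select' in get_pgsql_reserved_words()       # True for current pgsql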