qiita / Git / Diff of /qiita_db/metadata

Models:
AlyssaS/
qiita
Downloads: 1
Diff of /qiita_db/metadata_template/util.py [000000] .. [879b32]
Switch to side-by-side view

--- a
+++ b/qiita_db/metadata_template/util.py
@@ -0,0 +1,399 @@
+# -----------------------------------------------------------------------------
+# Copyright (c) 2014--, The Qiita Development Team.
+#
+# Distributed under the terms of the BSD 3-clause License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# -----------------------------------------------------------------------------
+from six import StringIO
+
+import pandas as pd
+import numpy as np
+import warnings
+from iteration_utilities import duplicates
+
+import qiita_db as qdb
+
+from string import ascii_letters, digits
+
+
+def prefix_sample_names_with_id(md_template, study_id):
+    r"""prefix the sample_names in md_template with the study id
+
+    Parameters
+    ----------
+    md_template : DataFrame
+        The metadata template to modify
+    study_id : int
+        The study to which the metadata belongs to
+    """
+    # loop over the samples and prefix those that aren't prefixed
+    sid = str(study_id)
+    md_template['qiita_sample_name_with_id'] = pd.Series(
+        [idx if idx.split('.', 1)[0] == sid and idx != sid
+         else '%d.%s' % (study_id, idx)
+         for idx in md_template.index], index=md_template.index)
+
+    # get the rows that are going to change
+    changes = len(md_template.index[
+        md_template['qiita_sample_name_with_id'] != md_template.index])
+    if changes != 0 and changes != len(md_template.index):
+        warnings.warn(
+            "Some of the samples were already prefixed with the study id.",
+            qdb.exceptions.QiitaDBWarning)
+
+    md_template.index = md_template.qiita_sample_name_with_id
+    del md_template['qiita_sample_name_with_id']
+    # The original metadata template had the index column unnamed -> remove
+    # the name of the index for consistency
+    md_template.index.name = None
+
+
+def load_template_to_dataframe(fn, index='sample_name'):
+    """Load a sample/prep template or a QIIME mapping file into a data frame
+
+    Parameters
+    ----------
+    fn : str or file-like object
+        filename of the template to load, or an already open template file
+    index : str, optional
+        Defaults to 'sample_name'. The index to use in the loaded information
+
+    Returns
+    -------
+    DataFrame
+        Pandas dataframe with the loaded information
+
+    Raises
+    ------
+    ValueError
+        Empty file passed
+    QiitaDBColumnError
+        If the sample_name column is not present in the template.
+    QiitaDBWarning
+        When columns are dropped because they have no content for any sample.
+    QiitaDBError
+        When non UTF-8 characters are found in the file.
+    QiitaDBDuplicateHeaderError
+        If duplicate columns are present in the template
+
+    Notes
+    -----
+    The index attribute of the DataFrame will be forced to be 'sample_name'
+    and will be cast to a string. Additionally rows that start with a '\t'
+    character will be ignored and columns that are empty will be removed. Empty
+    sample names will be removed from the DataFrame.
+
+    Column names are case-insensitive but will be lowercased on addition to
+    the database
+
+    Everything in the DataFrame will be read and managed as string
+
+    While reading the file via pandas, it's possible that it will raise a
+    'tokenizing' pd.errors.ParserError which is confusing for users; thus,
+    rewriting the error with an explanation of what it means and how to fix.
+    """
+    # Load in file lines
+    holdfile = None
+    with qdb.util.open_file(fn, newline=None,
+                            encoding="utf8", errors='ignore') as f:
+        holdfile = f.readlines()
+
+    if not holdfile:
+        raise ValueError('Empty file passed!')
+
+    if index == "#SampleID":
+        # We're going to parse a QIIME mapping file. We are going to first
+        # parse it with the QIIME function so we can remove the comments
+        # easily and make sure that QIIME will accept this as a mapping file
+        data, headers, comments = _parse_mapping_file(holdfile)
+        holdfile = ["%s\n" % '\t'.join(d) for d in data]
+        holdfile.insert(0, "%s\n" % '\t'.join(headers))
+        # The QIIME parser fixes the index and removes the #
+        index = 'SampleID'
+
+    # Strip all values in the cells in the input file
+    for pos, line in enumerate(holdfile):
+        cols = line.split('\t')
+        if pos == 0 and index != 'SampleID':
+            # get and clean the controlled columns
+            ccols = {'sample_name'}
+            ccols.update(qdb.metadata_template.constants.CONTROLLED_COLS)
+            newcols = [
+                c.lower().strip() if c.lower().strip() in ccols
+                else c.strip()
+                for c in cols]
+
+            # while we are here, let's check for duplicate columns headers
+            ncols = set(newcols)
+            if len(ncols) != len(newcols):
+                if '' in ncols:
+                    raise ValueError(
+                        'Your file has empty columns headers.')
+                raise qdb.exceptions.QiitaDBDuplicateHeaderError(
+                    set(duplicates(newcols)))
+        else:
+            # .strip will remove odd chars, newlines, tabs and multiple
+            # spaces but we need to read a new line at the end of the
+            # line(+'\n')
+            newcols = [d.strip(" \r\n") for d in cols]
+
+        holdfile[pos] = '\t'.join(newcols) + '\n'
+
+    # index_col:
+    #   is set as False, otherwise it is cast as a float and we want a string
+    # keep_default:
+    #   is set as False, to avoid inferring empty/NA values with the defaults
+    #   that Pandas has.
+    # comment:
+    #   using the tab character as "comment" we remove rows that are
+    #   constituted only by delimiters i. e. empty rows.
+    try:
+        template = pd.read_csv(
+            StringIO(''.join(holdfile)),
+            sep='\t',
+            dtype=str,
+            encoding='utf-8',
+            keep_default_na=False,
+            index_col=False,
+            comment='\t',
+            converters={index: lambda x: str(x).strip()})
+    except pd.errors.ParserError as e:
+        if 'tokenizing' in str(e):
+            msg = ('Your file has more columns with values than headers. To '
+                   'fix, make sure to delete any extra rows or columns; they '
+                   'might look empty because they have spaces. Then upload '
+                   'and try again.')
+            raise RuntimeError(msg)
+        else:
+            raise e
+    # remove newlines and tabs from fields
+    template.replace(to_replace='[\t\n\r\x0b\x0c]+', value='',
+                     regex=True, inplace=True)
+    # removing columns with empty values
+    template.dropna(axis='columns', how='all', inplace=True)
+    if template.empty:
+        raise ValueError("The template is empty")
+
+    initial_columns = set(template.columns)
+
+    if index not in template.columns:
+        raise qdb.exceptions.QiitaDBColumnError(
+            "The '%s' column is missing from your template, this file cannot "
+            "be parsed." % index)
+
+    # remove rows that have no sample identifier but that may have other data
+    # in the rest of the columns
+    template.dropna(subset=[index], how='all', inplace=True)
+
+    # set the sample name as the index
+    template.set_index(index, inplace=True)
+
+    # it is not uncommon to find templates that have empty columns so let's
+    # find the columns that are all ''
+    columns = np.where(np.all(template.applymap(lambda x: x == ''), axis=0))
+    template.drop(template.columns[columns], axis=1, inplace=True)
+
+    initial_columns.remove(index)
+    dropped_cols = initial_columns - set(template.columns)
+    if dropped_cols:
+        warnings.warn(
+            'The following column(s) were removed from the template because '
+            'all their values are empty: %s'
+            % ', '.join(dropped_cols), qdb.exceptions.QiitaDBWarning)
+
+    # removing 'sample-id' and 'sample_id' as per issue #2906
+    sdrop = []
+    if 'sample-id' in template.columns:
+        sdrop.append('sample-id')
+    if 'sample_id' in template.columns:
+        sdrop.append('sample_id')
+    if sdrop:
+        template.drop(columns=sdrop, inplace=True)
+        warnings.warn(
+            'The following column(s) were removed from the template because '
+            'they will cause conflicts with sample_name: %s'
+            % ', '.join(sdrop), qdb.exceptions.QiitaDBWarning)
+
+    # Pandas represents data with np.nan rather than Nones, change it to None
+    # because psycopg2 knows that a None is a Null in SQL, while it doesn't
+    # know what to do with NaN
+    template = template.where((pd.notnull(template)), None)
+
+    return template
+
+
+def get_invalid_sample_names(sample_names):
+    """Get a list of sample names that are not QIIME compliant
+
+    Parameters
+    ----------
+    sample_names : iterable
+        Iterable containing the sample names to check.
+
+    Returns
+    -------
+    list
+        List of str objects where each object is an invalid sample name.
+
+    References
+    ----------
+    .. [1] QIIME File Types documentaiton:
+    http://qiime.org/documentation/file_formats.html#mapping-file-overview.
+    """
+
+    # from the QIIME mapping file documentation
+    valid = set(ascii_letters+digits+'.')
+    inv = []
+
+    for s in sample_names:
+        if set(s) - valid:
+            inv.append(s)
+
+    return inv
+
+
+def looks_like_qiime_mapping_file(fp):
+    """Checks if the file looks like a QIIME mapping file
+
+    Parameters
+    ----------
+    fp : str or file-like object
+        filepath to check if it looks like a QIIME mapping file
+
+    Returns
+    -------
+    bool
+        True if fp looks like a QIIME mapping file, false otherwise.
+
+
+    Notes
+    -----
+    This is not doing a validation of the QIIME mapping file. It simply checks
+    the first line in the file and it returns true if the line starts with
+    '#SampleID', since a sample/prep template will start with 'sample_name' or
+    some other different column.
+    """
+    first_line = None
+    with qdb.util.open_file(fp, newline=None, errors='replace') as f:
+        first_line = f.readline()
+    if not first_line:
+        return False
+
+    first_col = first_line.split()[0]
+    return first_col == '#SampleID'
+
+
+def _parse_mapping_file(lines, strip_quotes=True, suppress_stripping=False):
+    """Parser for map file that relates samples to metadata.
+
+    Format: header line with fields
+            optionally other comment lines starting with #
+            tab-delimited fields
+
+    Parameters
+    ----------
+    lines : iterable of str
+        The contents of the QIIME mapping file
+    strip_quotes : bool, optional
+        Defaults to true. If true, quotes are removed from the data
+    suppress_stripping : bool, optional
+        Defaults to false. If true, spaces are not stripped
+
+    Returns
+    -------
+    list of lists, list of str, list of str
+        The data in the mapping file, the headers and the comments
+
+    Raises
+    ------
+    QiitaDBError
+        If there is any error parsing the mapping file
+
+    Notes
+    -----
+    This code has been ported from QIIME.
+    """
+    if strip_quotes:
+        if suppress_stripping:
+            # remove quotes but not spaces
+
+            def strip_f(x):
+                return x.replace('"', '')
+        else:
+            # remove quotes and spaces
+
+            def strip_f(x):
+                return x.replace('"', '').strip()
+    else:
+        if suppress_stripping:
+            # don't remove quotes or spaces
+
+            def strip_f(x):
+                return x
+        else:
+            # remove spaces but not quotes
+
+            def strip_f(x):
+                return x.strip()
+
+    # Create lists to store the results
+    mapping_data = []
+    header = []
+    comments = []
+
+    # Begin iterating over lines
+    for line in lines:
+        line = strip_f(line)
+        if not line or (suppress_stripping and not line.strip()):
+            # skip blank lines when not stripping lines
+            continue
+
+        if line.startswith('#'):
+            line = line[1:]
+            if not header:
+                header = line.strip().split('\t')
+            else:
+                comments.append(line)
+        else:
+            # Will add empty string to empty fields
+            tmp_line = list(map(strip_f, line.split('\t')))
+            if len(tmp_line) < len(header):
+                tmp_line.extend([''] * (len(header) - len(tmp_line)))
+            mapping_data.append(tmp_line)
+    if not header:
+        raise qdb.exceptions.QiitaDBError(
+            "No header line was found in mapping file.")
+    if not mapping_data:
+        raise qdb.exceptions.QiitaDBError(
+            "No data found in mapping file.")
+
+    return mapping_data, header, comments
+
+
+def get_pgsql_reserved_words():
+    """Returns a list of the current reserved words in pgsql
+
+    Returns
+    -------
+    set: str
+        The reserved words
+    """
+    with qdb.sql_connection.TRN:
+        sql = "SELECT word FROM pg_get_keywords() WHERE catcode = 'R';"
+        qdb.sql_connection.TRN.add(sql)
+        return set(qdb.sql_connection.TRN.execute_fetchflatten())
+
+
+def get_qiime2_reserved_words():
+    """Returns a list of the current reserved words in qiime2
+
+    Returns
+    -------
+    set: str
+        The reserved words
+    """
+    qiime2_reserved_column_names = ['feature id', 'feature-id', 'featureid',
+                                    'id', 'sample id', 'sample-id', 'sampleid']
+
+    return set(qiime2_reserved_column_names)