Diff of /exseek/scripts/utils.py [000000] .. [4c33d4]

Switch to side-by-side view

--- a
+++ b/exseek/scripts/utils.py
@@ -0,0 +1,95 @@
+from collections import namedtuple
+import numpy as np
+import inspect
+GFFRecord = namedtuple('GFFRecord', ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attr'])
+
+def read_gff(fin, zero_based=False):
+    '''Read a GFF file
+    Args:
+        fin: file-like object
+        zero_based: subtract start position by 1 if True
+    Yields:
+        feature: GFFRecord object
+    '''
+    for lineno, line in enumerate(fin):
+        if line.startswith('#'):
+            continue
+        c = line.strip().split('\t')
+        if len(c) < 9:
+            raise ValueError('less than 9 columns found in GFF file at line{}'.format(lineno + 1))
+        c[3] = int(c[3])
+        c[4] = int(c[4])
+        if zero_based:
+            c[3] -= 1
+        attrs = {}
+        for a in c[8].split(';'):
+            i = a.find('=')
+            key = a[:i]
+            val = a[(i + 1):]
+            attrs[key] = val
+        c[8] = attrs
+        yield GFFRecord._make(c)
+
+def search_dict(data, keys):
+    '''Intersect given keys with a dict and return the subset
+
+    Parameters:
+        data: dict
+        keys: list of keys to search
+    Returns:
+        dict
+    '''
+    return {key:data[key] for key in keys if key in data}
+
+def get_feature_importances(estimator):
+    from sklearn.feature_selection import RFE
+    from sklearn.neural_network import MLPClassifier
+    from sklearn.svm import SVC
+
+    '''Get feature importance attribute of an estimator
+    '''
+    if hasattr(estimator, 'coef_'):
+        return np.ravel(np.abs(estimator.coef_))
+    elif hasattr(estimator, 'feature_importances_'):
+        return np.ravel(estimator.feature_importances_)
+    elif isinstance(estimator, RFE):
+        ranking = estimator.ranking_.astype('float') - 1
+        return np.ravel(1.0 - ranking/ranking.max())
+    elif isinstance(estimator, MLPClassifier):
+        return np.zeros(estimator.coefs_[0].shape[0])
+    elif isinstance(estimator, SVC):
+        # SVC does not have feature importance
+        return np.zeros(estimator.support_vectors_.shape[1])
+    else:
+        raise ValueError('the estimator should have either coef_ or feature_importances_ attribute')
+
+def function_has_arg(func, arg):
+    return arg in inspect.getargspec(func).args
+
+def python_args_to_r_args(args):
+    def to_r_arg(s):
+        if isinstance(arg, str):
+            return "'" + arg + "'"
+        elif isinstance(arg, bool):
+            return 'TRUE' if arg else 'FALSE'
+        elif isinstance(arg, int):
+            return str(arg)
+        elif isinstance(arg, float):
+            return str(arg)
+        else:
+            raise TypeError('{}: unsupported type: {}'.format(arg, type(arg)))
+
+    r_args = {}
+    for name, arg in args.items():
+        is_atom_type = False
+        for atom_type in (str, int, bool, float):
+            if isinstance(arg, atom_type):
+                r_args[name] = to_r_arg(arg)
+                is_atom_type = True
+                break
+        if not is_atom_type:
+            if isinstance(arg, tuple) or isinstance(arg, list):
+                r_args[name] = 'c(' + ','.join(map(to_r_arg, arg)) + ')'
+            else:
+                raise TypeError('{}: unsupported type: {}'.format(arg, type(arg)))
+    return ', '.join(('%s=%s'%(k, v) for k, v in r_args.items()))
\ No newline at end of file