exSEEK / Git / Diff of /exseek/scripts/extract

Models:
DanielG/
exSEEK
Downloads: 1
Diff of /exseek/scripts/extract_bigwig.py [000000] .. [4c33d4]
Switch to side-by-side view

--- a
+++ b/exseek/scripts/extract_bigwig.py
@@ -0,0 +1,86 @@
+#! /usr/bin/env python
+from __future__ import print_function
+import argparse, sys, os, errno
+import logging
+logging.basicConfig(level=logging.INFO, format='[%(asctime)s] [%(levelname)s] %(name)s: %(message)s')
+
+command_handlers = {}
+def command_handler(f):
+    command_handlers[f.__name__] = f
+    return f
+
+@command_handler
+def abundant_rna_coverage_matrix(args):
+    import pandas as pd
+    import numpy as np
+    from tqdm import tqdm
+    import h5py
+    #from bx.bbi.bigwig_file import BigWigFile
+    from pykent import BigWigFile
+
+    logger.info('read count matrix: ' + args.matrix)
+    count_matrix = pd.read_table(args.matrix, sep='\t', index_col=0)
+    sample_ids = count_matrix.columns.values
+    count_matrix += 1
+    read_depth = count_matrix.sum(axis=0)
+    cpm = 1e6*count_matrix.div(read_depth, axis=1)
+    cpm_mean = np.mean(np.log2(cpm), axis=1)
+    # remove miRNA, piRNA, tRNA and genomic RNA
+    feature_info = cpm_mean.index.to_series().str.split('|', expand=True)
+    feature_info.columns = ['gene_id', 'gene_type', 'gene_name', 'domain_id', 'transcript_id', 'start', 'end']
+    feature_info['start'] = feature_info['start'].astype('int')
+    feature_info['end'] = feature_info['end'].astype('int')
+    cpm_mean = cpm_mean.loc[~feature_info['gene_type'].isin(['piRNA', 'miRNA', 'tRNA', 'genomic'])].copy()
+    # sort RNAs by mean log CPM
+    order = np.argsort(-cpm_mean)
+    # get top n abundant RNAs
+    cpm_mean = cpm_mean.iloc[order].head(args.n)
+    feature_info = feature_info.loc[cpm_mean.index.values]
+    
+    # initialize matrix
+    matrices = {}
+    for feature_name, feature in feature_info.iterrows():
+        matrices[feature_name] = np.zeros((len(sample_ids), feature['end']), dtype=np.float32)
+    # read bigwig files
+    logger.info('read coverage from bigwig files')
+    for i_sample, sample_id in tqdm(enumerate(sample_ids), total=len(sample_ids), unit='sample'):
+        bigwigs  = {}
+        for feature_name, feature in feature_info.iterrows():
+            if feature['gene_type'] not in bigwigs:
+                #bigwigs[feature['gene_type']] = BigWigFile(open(args.bigwig_pattern.format(sample_id=sample_id, gene_type=feature['gene_type']), 'rb'))
+                bigwigs[feature['gene_type']] = BigWigFile(args.bigwig_pattern.format(sample_id=sample_id, gene_type=feature['gene_type']))
+            #values = bigwigs[feature['gene_type']].get_as_array(
+            #    str(feature['transcript_id']), feature['start'], feature['end'])
+            values = bigwigs[feature['gene_type']].query(str(feature['transcript_id']))
+            if values is not None:
+                matrices[feature_name][i_sample, ~np.isnan(values)] = 1e6*values[~np.isnan(values)]/read_depth[sample_id]
+        del bigwigs
+    logger.info('write matrices to file: ' + args.output_file)
+    with h5py.File(args.output_file, 'w') as fout:
+        for feature_name, matrix in tqdm(matrices.items(), unit='matrix'):
+            fout.create_dataset(feature_name, data=matrix, compression='gzip')
+    fout.close()
+    
+if __name__ == '__main__':
+    main_parser = argparse.ArgumentParser(description='Count reads in BAM files')
+    subparsers = main_parser.add_subparsers(dest='command')
+
+    parser = subparsers.add_parser('abundant_rna_coverage_matrix',
+        help='Extract coverage from abundant long RNAs and build a matrix')
+    parser.add_argument('--matrix', type=str, required=True,
+        help='count matrix file')
+    parser.add_argument('--bigwig-pattern', type=str, required=True,
+        help='string format template for bigwig names.'
+        'e.g. output/dataset/tbigwig/{sample_id}.{gene_type}.bigWig')
+    parser.add_argument('-n', type=int, default=10, 
+        help='number of abundant genes to extract')
+    parser.add_argument('--output-file', '-o', type=str, required=True,
+        help='output matrices in HDF5 format')
+
+    args = main_parser.parse_args()
+    if args.command is None:
+        main_parser.print_help()
+        sys.exit(1)
+    logger = logging.getLogger('extract_bigwig.' + args.command)
+
+    command_handlers.get(args.command)(args)
\ No newline at end of file