#! /usr/bin/env python
"""Create IGV genome-browser references, track configs and rendered HTML.

Subcommands:
    create_reference  build an IGV reference (fasta, index, cytoband,
                      annotation) from a FASTA file
    generate_config   generate a YAML track configuration from per-sample
                      track files
    render            render an HTML page from a Jinja2 template and a
                      track configuration
"""
import argparse, sys, os, errno
import logging
import shutil
logging.basicConfig(level=logging.INFO, format='[%(asctime)s] [%(levelname)s] %(name)s: %(message)s')
from copy import deepcopy

# Module-level default logger; __main__ rebinds it to a per-command child
# logger. Without this default, importing the module and calling a handler
# directly would raise NameError.
logger = logging.getLogger('create_igv')

# Registry mapping subcommand name -> handler function.
command_handlers = {}
def command_handler(f):
    """Decorator: register *f* as the handler for the subcommand f.__name__."""
    command_handlers[f.__name__] = f
    return f

def color_palette(n_colors):
    """Return *n_colors* hex colors, cycling through the seaborn 'deep' palette.

    Equivalent to list(map(matplotlib.colors.to_hex, sns.color_palette('deep', 10)))
    repeated as needed, without the matplotlib/seaborn dependency.
    """
    colors = ['#4c72b0', '#dd8452', '#55a868', '#c44e52', '#8172b3',
        '#937860', '#da8bc3', '#8c8c8c', '#ccb974', '#64b5cd']
    return [colors[i % len(colors)] for i in range(n_colors)]

def read_dict_path(d, path):
    """Walk a '/'-separated *path* into nested dict *d* and return the node.

    NOTE: a missing key silently stops the walk and the last reachable node
    is returned (quirk preserved for backward compatibility).
    """
    node = d
    for key in path.strip('/').split('/'):
        child = node.get(key)
        if child is not None:
            node = child
    return node

def write_dict_path(d, path, value):
    """Set *value* at a '/'-separated *path* inside nested dict *d*.

    Intermediate dicts are created as needed. Returns *d*.

    Bug fix: the previous implementation never descended into the nested
    dicts — it assigned every path component at the top level of *d*.
    """
    keys = path.strip('/').split('/')
    node = d
    for key in keys[:-1]:
        child = node.get(key)
        if not isinstance(child, dict):
            child = {}
            node[key] = child
        node = child
    node[keys[-1]] = value
    return d

@command_handler
def generate_config(args):
    """Generate a YAML IGV track configuration from per-sample track files.

    Reads sample classes, assigns one color per class, and creates one track
    entry per sample (per strand for wig-type tracks), prefixing all URLs
    with args.base_url.
    """
    import yaml
    from collections import defaultdict, OrderedDict

    # Read the sample-classes table (header line skipped): sample_id <TAB> class.
    unique_classes = set()
    sample_classes = OrderedDict()
    samples_by_group = defaultdict(list)
    with open(args.sample_classes, 'r') as f:
        f.readline()
        for line in f:
            c = line.strip().split('\t')
            sample_classes[c[0]] = c[1]
            unique_classes.add(c[1])
            samples_by_group[c[1]].append(c[0])
    config = {}
    config['options'] = {}
    logger.info('load reference: ' + args.reference)
    # safe_load avoids arbitrary object construction from the YAML file
    # (yaml.load without a Loader is deprecated and unsafe).
    with open(args.reference, 'r') as f:
        config.update(yaml.safe_load(f))
    for key in ('reference', 'genome'):
        if key not in config['options']:
            raise KeyError('key "{}" is not defined in reference file: {}'.format(key, args.reference))
    config['options']['locus'] = args.locus
    # Define one display group per sample class. Iterate classes in sorted
    # order so the class -> color mapping is deterministic across runs
    # (iterating the raw set made colors depend on hash randomization).
    config['groups'] = {}
    colors = {}
    ordered_classes = sorted(unique_classes)
    for sample_class, color in zip(ordered_classes, color_palette(len(ordered_classes))):
        colors[sample_class] = color
        config['groups'][sample_class] = dict(color=color, show=True)
    # Tracks already present in the reference config keep the lowest orders.
    order = 1
    if 'tracks' not in config:
        config['tracks'] = {}
    else:
        for name, track in config['tracks'].items():
            track['order'] = order
            order += 1

    strands = ['+', '-']
    if args.strand is not None:
        strands = [args.strand]

    # Map track file extension -> IGV track type/format.
    track_extensions = {
        'bigwig': {'type': 'wig', 'format': 'bigwig'},
        'bam': {'type': 'alignment', 'format': 'bam'},
        'bed': {'type': 'annotation', 'format': 'bed'},
        'genepred': {'type': 'annotation', 'format': 'genepred'},
        'genepredext': {'type': 'annotation', 'format': 'genepredext'},
        'gff3': {'type': 'annotation', 'format': 'gff3'},
        'gtf': {'type': 'annotation', 'format': 'gtf'}
    }
    track_extension = args.track_pattern.split('.')[-1].lower()
    if track_extension not in track_extensions:
        raise ValueError('unknown track file extension: {}'.format(track_extension))
    track_type = track_extensions[track_extension]['type']
    track_format = track_extensions[track_extension]['format']

    for sample_class in ordered_classes:
        for sample_index, sample_id in enumerate(samples_by_group[sample_class]):
            track_config = dict(
                name=sample_id,
                sample_id=sample_id,
                url=args.track_pattern.format(sample_id=sample_id, strand='.'),
                type=track_type,
                format=track_format,
                height=25,
                group=sample_class,
                strand='.',
                order=order,
                color=colors[sample_class],
                autoscale=True,
                show=False
            )
            # only show the first N tracks of each class
            if sample_index < args.max_samples_per_class:
                track_config['show'] = True
            if track_type == 'wig':
                # separate tracks by strand for wig type
                for strand in strands:
                    track_config_by_strand = deepcopy(track_config)
                    track_config_by_strand['name'] = '{0}({1})'.format(sample_id, strand)
                    track_config_by_strand['strand'] = strand
                    track_config_by_strand['order'] = order
                    track_config_by_strand['url'] = args.track_pattern.format(sample_id=sample_id, strand=strand)
                    config['tracks'][track_config_by_strand['name']] = track_config_by_strand
                    order += 1
            else:
                if track_config['format'] == 'bam':
                    # alignment tracks need more vertical space and an index
                    track_config['height'] = 400
                    track_config['indexURL'] = track_config['url'] + '.bai'
                config['tracks'][track_config['name']] = track_config
                order += 1
    # sort tracks by display order
    config['tracks'] = dict(sorted(config['tracks'].items(), key=lambda x: x[1]['order']))
    # prefix all file URLs with base_url
    for key in ('fastaURL', 'indexURL', 'cytobandURL'):
        if key in config['options']['reference']:
            config['options']['reference'][key] = args.base_url + '/' + config['options']['reference'][key]
    if 'tracks' in config:
        for name, track in config['tracks'].items():
            for key in ('url', 'indexURL'):
                if key in track:
                    track[key] = args.base_url + '/' + track[key]
    if 'search' in config['options']:
        if 'url' in config['options']['search']:
            config['options']['search']['url'] = args.base_url + '/' + config['options']['search']['url']
    with open(args.output_file, 'w') as fout:
        yaml.dump(config, fout, default_flow_style=False)

@command_handler
def render(args):
    """Render an HTML page from a Jinja2 template and a YAML track config.

    Tracks are sorted by 'order' and exposed to the template both as a dict
    and as pre-serialized JSON (tracks_json, options_json).
    """
    from jinja2 import Environment, StrictUndefined
    from ioutils import open_file_or_stdout
    import yaml
    import json

    env = Environment(lstrip_blocks=True, trim_blocks=True, undefined=StrictUndefined)
    with open(args.input_file, 'r') as f:
        template = env.from_string(f.read())

    with open(args.config, 'r') as f:
        config = yaml.safe_load(f)
    config['tracks'] = dict(sorted(config['tracks'].items(), key=lambda x: x[1]['order']))
    config['tracks_json'] = json.dumps(config['tracks'], indent=4)
    config['options_json'] = json.dumps(config['options'], indent=4)
    with open_file_or_stdout(args.output_file) as f:
        f.write(template.render(**config))

@command_handler
def create_reference(args):
    """Build an IGV reference from a FASTA file.

    Writes into args.output_dir: reference.fa (+ .fai), cytoband.txt, a BED
    annotation converted to genePred, and a config.yaml describing them.
    Optionally subsets sequences (--gene-ids) and renames them (--gene-names).
    Requires 'samtools' and 'bedToGenePred' on PATH.
    """
    import yaml
    import pandas as pd
    import numpy as np
    from pyfaidx import Fasta
    import subprocess

    logger.info('read fasta')
    fasta = Fasta(args.fasta)

    logger.info('read fasta index')
    fasta_index = pd.read_table(args.fasta + '.fai', sep='\t', header=None, dtype=str).iloc[:, [0, 1]].copy()
    fasta_index.columns = ['name', 'length']
    fasta_index.index = fasta_index['name']
    fasta_index.iloc[:, 1] = fasta_index.iloc[:, 1].astype('int')
    # optional mapping: gene_id -> gene_name
    gene_names = None
    if args.gene_names is not None:
        logger.info('convert gene ids to gene names')
        gene_names = pd.read_table(args.gene_names, sep='\t', header=None, dtype=str, index_col=0).iloc[:, 0]
    # optional subset of gene ids to keep
    gene_ids = None
    if args.gene_ids is not None:
        logger.info('read gene ids')
        gene_ids = pd.read_table(args.gene_ids, dtype=str, header=None).iloc[:, 0]

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)
    config = {}
    config['genome'] = args.genome
    config['reference'] = {}
    # Bug fix: 'id' was previously set only when --name was given (duplicated
    # "if args.name" guard); it is derived from --genome and always needed.
    config['reference']['id'] = args.genome
    if args.name is not None:
        config['reference']['name'] = args.name
    else:
        config['reference']['name'] = args.genome
    config['reference']['fastaURL'] = os.path.join(args.output_dir, 'reference.fa')
    config['reference']['indexURL'] = os.path.join(args.output_dir, 'reference.fa.fai')
    config['reference']['cytobandURL'] = os.path.join(args.output_dir, 'cytoband.txt')
    config['tracks'] = {
        args.genome: {
            'name': args.genome,
            'type': 'annotation',
            'format': 'genepred',
            'url': os.path.join(args.output_dir, 'annotation.genePred'),
            'displayMode': 'EXPANDED',
            'searchable': True,
            'visibilityWindow': 300000000,
            'height': 100,
            'show': True
        }
    }
    if (gene_ids is None) and (gene_names is None):
        # no subsetting/renaming needed: reuse the input files as-is
        logger.info('copy fasta file')
        shutil.copyfile(args.fasta, config['reference']['fastaURL'])
        logger.info('copy fasta index')
        shutil.copyfile(args.fasta + '.fai', config['reference']['indexURL'])
    else:
        logger.info('write subset of reference fasta')
        if gene_ids is None:
            gene_ids = list(fasta.keys())
        with open(config['reference']['fastaURL'], 'w') as fout:
            for gene_id in gene_ids:
                record = fasta[gene_id]
                if record is None:
                    raise ValueError('gene id {} is not found in fasta file'.format(gene_id))
                seq = record[:].seq
                if gene_names is not None:
                    gene_id = gene_names[gene_id]
                fout.write('>{}\n'.format(gene_id))
                fout.write(seq)
                fout.write('\n')
        logger.info('generate fasta index')
        subprocess.check_call(['samtools', 'faidx', config['reference']['fastaURL']])
        fasta_index = fasta_index.loc[gene_ids]
        if gene_names is not None:
            fasta_index['name'] = gene_names[fasta_index['name'].values]

    # one cytoband entry per sequence: name, 0, length, band name, stain
    cytoband = pd.DataFrame(index=np.arange(fasta_index.shape[0]), columns=np.arange(5))
    cytoband.iloc[:, 0] = fasta_index['name'].values
    cytoband.iloc[:, 1] = np.zeros(fasta_index.shape[0], dtype='int')
    cytoband.iloc[:, 2] = fasta_index['length'].values
    cytoband.iloc[:, 3] = ['p%d.1'%i for i in range(fasta_index.shape[0])]
    cytoband.iloc[:, 4] = np.full(fasta_index.shape[0], 'gneg')
    logger.info('write cytoband file')
    cytoband.to_csv(config['reference']['cytobandURL'], sep='\t', header=False, index=False)

    # BED6 annotation spanning each full sequence, then convert to genePred
    bed = pd.DataFrame(index=np.arange(fasta_index.shape[0]), columns=np.arange(6))
    bed.iloc[:, 0] = fasta_index['name'].values
    bed.iloc[:, 1] = np.zeros(fasta_index.shape[0], dtype='int')
    bed.iloc[:, 2] = fasta_index['length'].values
    bed.iloc[:, 3] = fasta_index['name'].values
    bed.iloc[:, 4] = np.full(fasta_index.shape[0], '0')
    bed.iloc[:, 5] = np.full(fasta_index.shape[0], '+')
    logger.info('write annotation bed file')
    bed_file = os.path.join(args.output_dir, 'annotation.bed')
    bed.to_csv(bed_file, sep='\t', header=False, index=False)
    logger.info('write annotation genePred file')
    subprocess.check_call(['bedToGenePred', bed_file, config['tracks'][args.genome]['url']])

    with open(os.path.join(args.output_dir, 'config.yaml'), 'w') as fout:
        yaml.dump(config, fout, default_flow_style=False)

if __name__ == '__main__':
    main_parser = argparse.ArgumentParser(description='Create IGV')
    subparsers = main_parser.add_subparsers(dest='command')

    parser = subparsers.add_parser('create_reference',
        help='Create IGV reference genome from FASTA file')
    parser.add_argument('--genome', type=str, help='genome name', required=True)
    parser.add_argument('--name', type=str, help='reference name')
    parser.add_argument('--fasta', type=str, required=True)
    parser.add_argument('--gene-names', type=str,
        help='Tab-separated file with 2 columns: gene_id, gene_name')
    parser.add_argument('--gene-ids', type=str,
        help='Only use gene ids in provided list file')
    parser.add_argument('--output-dir', '-o', type=str, required=True,
        help='output directory')

    parser = subparsers.add_parser('render',
        help='Render HTML from config file')
    parser.add_argument('--input-file', '-i', type=str, required=True,
        help='template file for IGV files')
    parser.add_argument('--output-file', '-o', type=str, default='-',
        help='output file')
    parser.add_argument('--config', '-c', type=str, help='track list in YAML format')

    parser = subparsers.add_parser('generate_config',
        help='Generate config from track files')
    parser.add_argument('--sample-classes', type=str, required=True,
        help='sample classes file')
    parser.add_argument('--reference', type=str, required=True,
        help='config file for reference genome')
    parser.add_argument('--max-samples-per-class', type=int, default=10)
    parser.add_argument('--track-pattern', type=str, required=True,
        help='format string with variables: sample_id, strand')
    parser.add_argument('--locus', '-l', type=str, default='',
        help='genomic locus, e.g. chr1:1000000-1000100')
    parser.add_argument('--base-url', type=str, default='.')
    parser.add_argument('--extra-config', type=str)
    parser.add_argument('--strand', '-s', type=str)
    parser.add_argument('--output-file', '-o', type=str, required=True)

    args = main_parser.parse_args()
    if args.command is None:
        main_parser.print_help()
        sys.exit(1)
    logger = logging.getLogger('create_igv.' + args.command)

    try:
        command_handlers.get(args.command)(args)
    except BrokenPipeError:
        pass
    except KeyboardInterrupt:
        pass