Diff of /step1.py [000000] .. [f8af2c]

Switch to unified view

a b/step1.py
1
"""
2
Generating useful files
3
    -- all_ALS_var.txt
4
    -- promoter1.csv
5
    -- var_in_file.json
6
"""
7
8
import os
9
import sys
10
import vcf
11
import pandas as pd
12
import numpy as np
13
import bisect
14
import json
15
import csv
16
17
def Used_variant_pos(promoters):
18
    used_variant = []
19
    for pro in promoters:
20
        for p in pro:
21
            if p not in used_variant: used_variant.append(p)
22
    return used_variant
23
24
fold_path = ''#'./chr22new/'
25
files = ['xaa.vcf']
26
# read all ALS variants position
27
all_pos = []
28
for f_str in files:
29
    file_name = fold_path+f_str
30
    with open(file_name, 'r') as f:
31
        lines = [l.replace('\n','').split("\t") for l in f if not l.startswith('##')]
32
    info = lines[2:]
33
    l = [i[1] for i in info]
34
    all_pos += l
35
print "Total number of variant pos: ", len(all_pos)
36
all_pos = map(int, all_pos)
37
all_pos_min = min(all_pos)/10000*10000
38
all_pos_max = max(all_pos)/10000*10000
39
40
# all_pos is the list of poistion of variants from ALS dataset
41
ALS_var_pos = map(int, all_pos) # variant position in ALS
42
43
# read refS dataset of this chromsome
44
csv_folder = './csv_files/'
45
refs = pd.read_csv(csv_folder+'chr22.csv', sep="\t")
46
refs = refs.drop_duplicates(subset=['txStart', 'txEnd']).sort_values('txStart')
47
48
# scale the refS dataset
49
refs_ = refs[refs.txStart>all_pos_min]
50
refs_ = refs_[refs_.txStart < all_pos_max]
51
52
temp = refs_.txStart.tolist()
53
ref_start = map(int, temp) # start posotion
54
temp = refs_.txEnd.tolist()
55
ref_end = map(int, temp) # end position
56
temp = refs_.strand.tolist()
57
strand_f = lambda x: 1 if x == "+" else 0
58
ref_strand = map(strand_f, temp) # strand
59
60
# generate promoter ddictionary
61
# {start_RefS_pos: ALS_promoter_vars}
62
promoter_dict = {}
63
sim_promoter_dict = {}
64
unique_promoter = []
65
for i in range(len(ref_start)):
66
    promoter_pos = []
67
    strand_ = ref_strand[i]
68
69
    if strand_:
70
        start_p = ref_start[i]
71
        p = bisect.bisect_left(ALS_var_pos, start_p + .5)
72
        promoter_pos = ALS_var_pos[p - 55:p] + ALS_var_pos[p:p + 9]
73
    else:
74
        start_p = ref_end[i]
75
        p = bisect.bisect_left(ALS_var_pos, start_p - .5)
76
        promoter_pos = ALS_var_pos[p - 8:p] + ALS_var_pos[p:p + 56]
77
78
    promoter_dict[start_p] = promoter_pos
79
    if promoter_pos not in unique_promoter:
80
        unique_promoter.append(promoter_pos)
81
82
print "    There are ", len(promoter_dict.values()), " genes"
83
print "    There are unique", len(unique_promoter), " genes"
84
print "   ",len(promoter_dict.values()) - len(unique_promoter), "genes are repeated"
85
86
# used ALS variants position
87
promoters = promoter_dict.values()
88
used_var = Used_variant_pos(promoters)
89
90
# generate lisit of used ALS position
91
# because there are some repeat pos
92
# this process will generate all necessary information for sequencing
93
q = []
94
for i in all_pos:
95
    if i in used_var:
96
        q.append(i)
97
98
thefile = open('all_ALS_var.txt', 'w')
99
for item in q:
100
    thefile.write("%s\n" % item)
101
102
103
# generate used variants in each .vcf file
104
all_pos_dic = {}
105
for f_str in files:
106
    l = []
107
    file_name = fold_path+f_str
108
    with open(file_name, 'r') as f:
109
        lines = [s.replace('\n','').split("\t") for s in f if not s.startswith('##')]
110
    info = lines[2:]
111
    l = [i[1] for i in info]
112
    all_pos_dic[f_str] = l
113
print "Total number of variant file: ", len(all_pos_dic)
114
115
for k,v in all_pos_dic.items():
116
    q = []
117
    for i in v:
118
        if int(i) in used_var:
119
            q.append(i)
120
    all_pos_dic[k] = q
121
122
123
with open('var_in_file.json', 'w') as fp:
124
    json.dump(all_pos_dic, fp)
125
126
# save promoter information
127
promotersx = []
128
for ps in promoters:
129
    if ps not in promotersx:
130
        promotersx.append(ps)
131
promotersx = sorted(promotersx)
132
133
134
with open("promoter1.csv", "wb") as f:
135
    writer = csv.writer(f)
136
    writer.writerows(promotersx)
137
138
print "Files generated: \n \t --var_in_file.json \n \t -- promoter1.csv \n \t -- all_ALS_var.txt"