|
a |
|
b/step2.py |
|
|
1 |
""" |
|
|
2 |
Generate individual files and chr22.json |
|
|
3 |
|
|
|
4 |
Running this code need following command: |
|
|
5 |
mkdir individual |
|
|
6 |
mkdir promoters |
|
|
7 |
|
|
|
8 |
""" |
|
|
9 |
import sys |
|
|
10 |
import os |
|
|
11 |
import vcf |
|
|
12 |
import pandas as pd |
|
|
13 |
import numpy as np |
|
|
14 |
import csv |
|
|
15 |
import time |
|
|
16 |
import json |
|
|
17 |
|
|
|
18 |
|
|
|
19 |
|
|
|
20 |
def subfinder(mylist, pattern): |
|
|
21 |
for i in range(len(mylist)): |
|
|
22 |
if mylist[i] == pattern[0] and mylist[i:i + len(pattern)] == pattern: |
|
|
23 |
return i, i + len(pattern) |
|
|
24 |
|
|
|
25 |
|
|
|
26 |
def promoter_var_idx(p_idx, promoter, var_idx): |
|
|
27 |
pmt_var = promoter[p_idx] # which promoter |
|
|
28 |
p_start, p_end = subfinder(var_idx, pmt_var) |
|
|
29 |
return p_start, p_end |
|
|
30 |
|
|
|
31 |
|
|
|
32 |
# read used ALS variants in each .vcf file |
|
|
33 |
with open('var_in_file.json') as json_data: |
|
|
34 |
var_file_dic = json.load(json_data) |
|
|
35 |
|
|
|
36 |
# read promoter table |
|
|
37 |
with open('promoter1.csv', 'rb') as f: |
|
|
38 |
reader = csv.reader(f) |
|
|
39 |
promoter = list(reader) |
|
|
40 |
|
|
|
41 |
# read used variant posotions |
|
|
42 |
with open('all_ALS_var.txt','r') as f: |
|
|
43 |
var_idx = [i.replace('\n','') for i in f] |
|
|
44 |
|
|
|
45 |
# mapping dictionary |
|
|
46 |
var_num_dict = {"0/0":'0', |
|
|
47 |
"0/1":'1', |
|
|
48 |
"1/0":'1', |
|
|
49 |
"1/1":'2', |
|
|
50 |
"./.":'-1'} |
|
|
51 |
|
|
|
52 |
# IDS: |
|
|
53 |
labels_file = 'labes.csv' |
|
|
54 |
labels_df = pd.read_csv(labels_file,index_col=0) |
|
|
55 |
ids = labels_df.FID.tolist() |
|
|
56 |
|
|
|
57 |
# create individual files |
|
|
58 |
print 'Create individual files' |
|
|
59 |
for ind in ids: |
|
|
60 |
id_file_name = 'individual/'+str(ind)+'.txt' |
|
|
61 |
id_file = open(id_file_name, 'w') |
|
|
62 |
id_file.write('') |
|
|
63 |
|
|
|
64 |
## path to vcf files |
|
|
65 |
#files = os.listdir('./chr22') |
|
|
66 |
#files.sort() |
|
|
67 |
#files = files[1:] |
|
|
68 |
files = ['xaa.vcf'] |
|
|
69 |
|
|
|
70 |
|
|
|
71 |
print 'Start writing....' |
|
|
72 |
num_vcf = len(files) |
|
|
73 |
num_vcf_batch = len(files)*0.05 |
|
|
74 |
vcf_i = 0 |
|
|
75 |
for f_str in files: |
|
|
76 |
if vcf_i % num_vcf_batch == 0: |
|
|
77 |
print vcf_i / num_vcf*100., '%....' |
|
|
78 |
|
|
|
79 |
file_name = '' + f_str |
|
|
80 |
with open(file_name, 'r') as f: |
|
|
81 |
lines = [l.replace('\n', '').split("\t") for l in f if not l.startswith('##')] |
|
|
82 |
info = lines[2:] |
|
|
83 |
# create dataframe |
|
|
84 |
v_df = pd.DataFrame.from_records(info, columns=lines[0]) |
|
|
85 |
use_less_l = ["ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"] |
|
|
86 |
v_df.drop(use_less_l, axis=1, inplace=True) |
|
|
87 |
# 0/0 ==> 0 i.e. |
|
|
88 |
samples = lines[0][9:] |
|
|
89 |
replace_dict = {} |
|
|
90 |
for i in samples: |
|
|
91 |
replace_dict[i] = var_num_dict |
|
|
92 |
|
|
|
93 |
v_df_r = v_df.replace(to_replace=replace_dict) |
|
|
94 |
var_df = v_df_r[v_df_r['POS'].isin(var_file_dic[f_str])] |
|
|
95 |
# write to file |
|
|
96 |
for ind in ids: |
|
|
97 |
# the id in vcf file is 'LP6008192-DNA_E10_LP6008192-DNA_E10' for 'LP6008192-DNA_E10' |
|
|
98 |
new_ind = str(ind)+'_'+str(ind) |
|
|
99 |
if new_ind in var_df.columns.tolist()[2:]: |
|
|
100 |
info = var_df[new_ind].tolist() |
|
|
101 |
id_file_name = 'individual/' + str(ind) + '.txt' |
|
|
102 |
id_file = open(id_file_name, 'a') |
|
|
103 |
for item in info: |
|
|
104 |
id_file.write("%s\n" % item) |
|
|
105 |
id_file.close() |
|
|
106 |
else: |
|
|
107 |
print ind |
|
|
108 |
vcf_i += 1. |
|
|
109 |
|
|
|
110 |
print "100.0%... Done, generated all used ALS variants for each individual" |
|
|
111 |
|
|
|
112 |
num_pro = 0 |
|
|
113 |
print "Generating No.",num_pro," promoter in chr",22 |
|
|
114 |
|
|
|
115 |
p_start,p_end = promoter_var_idx(0,promoter,var_idx=var_idx) |
|
|
116 |
promoter_ind = {} |
|
|
117 |
for ind in ids: |
|
|
118 |
indiv_file = 'individual/'+str(ind)+'.txt' |
|
|
119 |
with open(indiv_file,'r') as f: |
|
|
120 |
ind_v = [i.replace('\n','') for i in f] |
|
|
121 |
promo = ind_v[p_start:p_end] |
|
|
122 |
promoter_ind[ind] = promo |
|
|
123 |
|
|
|
124 |
with open('promoters/chr22.json', 'w') as fp: |
|
|
125 |
json.dump(promoter_ind, fp) |
|
|
126 |
|
|
|
127 |
|