Diff of /step2.py [000000] .. [f8af2c]

Switch to unified view

a b/step2.py
1
"""
2
Generate individual files and chr22.json
3
4
Running this code need following command:
5
    mkdir individual
6
    mkdir promoters
7
8
"""
9
import sys
10
import os
11
import vcf
12
import pandas as pd
13
import numpy as np
14
import csv
15
import time
16
import json
17
18
19
20
def subfinder(mylist, pattern):
21
    for i in range(len(mylist)):
22
        if mylist[i] == pattern[0] and mylist[i:i + len(pattern)] == pattern:
23
            return i, i + len(pattern)
24
25
26
def promoter_var_idx(p_idx, promoter, var_idx):
27
    pmt_var = promoter[p_idx]  # which promoter
28
    p_start, p_end = subfinder(var_idx, pmt_var)
29
    return p_start, p_end
30
31
32
# read used ALS variants in each .vcf file
33
with open('var_in_file.json') as json_data:
34
    var_file_dic = json.load(json_data)
35
36
# read promoter table
37
with open('promoter1.csv', 'rb') as f:
38
    reader = csv.reader(f)
39
    promoter = list(reader)
40
41
# read used variant posotions
42
with open('all_ALS_var.txt','r') as f:
43
    var_idx = [i.replace('\n','') for i in f]
44
45
# mapping dictionary
46
var_num_dict = {"0/0":'0',
47
                "0/1":'1',
48
                "1/0":'1',
49
                "1/1":'2',
50
                "./.":'-1'}
51
52
# IDS:
53
labels_file = 'labes.csv'
54
labels_df = pd.read_csv(labels_file,index_col=0)
55
ids = labels_df.FID.tolist()
56
57
# create individual files
58
print 'Create individual files'
59
for ind in ids:
60
    id_file_name = 'individual/'+str(ind)+'.txt'
61
    id_file = open(id_file_name, 'w')
62
    id_file.write('')
63
64
## path to vcf files
65
#files = os.listdir('./chr22')
66
#files.sort()
67
#files = files[1:]
68
files = ['xaa.vcf']
69
70
71
print 'Start writing....'
72
num_vcf = len(files)
73
num_vcf_batch = len(files)*0.05
74
vcf_i = 0
75
for f_str in files:
76
    if vcf_i % num_vcf_batch == 0:
77
        print vcf_i / num_vcf*100., '%....'
78
79
    file_name = '' + f_str
80
    with open(file_name, 'r') as f:
81
        lines = [l.replace('\n', '').split("\t") for l in f if not l.startswith('##')]
82
    info = lines[2:]
83
    # create dataframe
84
    v_df = pd.DataFrame.from_records(info, columns=lines[0])
85
    use_less_l = ["ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"]
86
    v_df.drop(use_less_l, axis=1, inplace=True)
87
    # 0/0 ==> 0 i.e.
88
    samples = lines[0][9:]
89
    replace_dict = {}
90
    for i in samples:
91
        replace_dict[i] = var_num_dict
92
93
    v_df_r = v_df.replace(to_replace=replace_dict)
94
    var_df = v_df_r[v_df_r['POS'].isin(var_file_dic[f_str])]
95
    # write to file
96
    for ind in ids:
97
        # the id in vcf file is 'LP6008192-DNA_E10_LP6008192-DNA_E10' for 'LP6008192-DNA_E10'
98
        new_ind = str(ind)+'_'+str(ind)
99
        if new_ind in var_df.columns.tolist()[2:]:
100
            info = var_df[new_ind].tolist()
101
            id_file_name = 'individual/' + str(ind) + '.txt'
102
            id_file = open(id_file_name, 'a')
103
            for item in info:
104
                id_file.write("%s\n" % item)
105
            id_file.close()
106
        else:
107
            print ind
108
    vcf_i += 1.
109
110
print "100.0%... Done, generated all used ALS variants for each individual"
111
112
num_pro = 0
113
print "Generating No.",num_pro," promoter in chr",22
114
115
p_start,p_end = promoter_var_idx(0,promoter,var_idx=var_idx)
116
promoter_ind = {}
117
for ind in ids:
118
    indiv_file = 'individual/'+str(ind)+'.txt'
119
    with open(indiv_file,'r') as f:
120
        ind_v = [i.replace('\n','') for i in f]
121
    promo = ind_v[p_start:p_end]
122
    promoter_ind[ind] = promo
123
124
with open('promoters/chr22.json', 'w') as fp:
125
    json.dump(promoter_ind, fp)
126
127