a b/promoter_generator.py
1
import sys
2
import os
3
import vcf
4
import pandas as pd
5
import numpy as np
6
import csv
7
import time
8
import json
9
10
11
12
def subfinder(mylist, pattern):
    """Find the first contiguous occurrence of ``pattern`` inside ``mylist``.

    Returns a ``(start, end)`` tuple such that ``mylist[start:end] == pattern``
    for the earliest matching ``start``. Returns ``None`` when no position
    matches (including when ``mylist`` is empty).
    """
    for start, item in enumerate(mylist):
        # Cheap first-element test before paying for a slice comparison.
        if not (item == pattern[0]):
            continue
        stop = start + len(pattern)
        if mylist[start:stop] == pattern:
            return start, stop
    return None
16
17
18
def promoter_var_idx(p_idx, promoter, var_idx):
    """Return the (start, end) span of one promoter within ``var_idx``.

    ``promoter[p_idx]`` is looked up and searched for as a contiguous run in
    ``var_idx`` using ``subfinder``. The two values returned by ``subfinder``
    are unpacked and handed back as a tuple, so a ``None`` result from
    ``subfinder`` (no match) fails at the unpacking step.
    """
    wanted = promoter[p_idx]  # the promoter entry selected by p_idx
    bounds = subfinder(var_idx, wanted)
    first, last = bounds
    return first, last
22
23
# IDS: individual identifiers are taken from the FID column of the labels table.
# NOTE(review): 'labes.csv' looks like a misspelling of 'labels.csv'; the path
# is left untouched here -- confirm against the actual data directory.
labels_file = 'labes.csv'
labels_df = pd.read_csv(labels_file, index_col=0)
ids = labels_df.FID.tolist()

# Read the promoter table: one CSV row per promoter.
# 'rb' is the mode the Python 2 csv module expects for its input file.
with open('promoter1.csv', 'rb') as f:
    reader = csv.reader(f)
    promoter = list(reader)

# Read the used variant positions, one entry per line.
# The comprehension variable is deliberately not called `i`: in Python 2 a
# list-comprehension variable leaks into the enclosing scope and would
# overwrite any loop index of the same name.
with open('all_ALS_var.txt', 'r') as f:
    var_idx = [line.replace('\n', '') for line in f]

# Single-argument, %-formatted print calls emit the same text as the original
# print statements and parse under both Python 2 and Python 3.
print("number of promoters %d" % len(promoter))

# Only promoters 10..99 are generated (the original loop carried a
# "(len(promoter))" hint for processing the whole table). The stop value is
# clamped so that a table with fewer than 100 rows ends the loop cleanly
# instead of raising IndexError part-way through.
first_promoter = 10
last_promoter = min(100, len(promoter))

for num_pro in range(first_promoter, last_promoter):
    # The double space before "promoter" reproduces the original output.
    print("Generating No. %d  promoter in chr %d" % (num_pro, 22))

    # Span of this promoter inside the variant list; every individual's file
    # is sliced with the same span below.
    p_start, p_end = promoter_var_idx(num_pro, promoter, var_idx=var_idx)

    # Maps individual id -> that individual's lines for this promoter.
    promoter_ind = {}
    for ind in ids:
        indiv_file = 'individual/' + str(ind) + '.txt'
        with open(indiv_file, 'r') as f:
            ind_v = [line.replace('\n', '') for line in f]
        promo = ind_v[p_start:p_end]
        promoter_ind[ind] = promo

    # One JSON file per promoter.
    with open('promoters/chr22_' + str(num_pro) + '.json', 'w') as fp:
        json.dump(promoter_ind, fp)