--- a +++ b/step2.py @@ -0,0 +1,127 @@ +""" +Generate individual files and chr22.json + +Running this code need following command: + mkdir individual + mkdir promoters + +""" +import sys +import os +import vcf +import pandas as pd +import numpy as np +import csv +import time +import json + + + +def subfinder(mylist, pattern): + for i in range(len(mylist)): + if mylist[i] == pattern[0] and mylist[i:i + len(pattern)] == pattern: + return i, i + len(pattern) + + +def promoter_var_idx(p_idx, promoter, var_idx): + pmt_var = promoter[p_idx] # which promoter + p_start, p_end = subfinder(var_idx, pmt_var) + return p_start, p_end + + +# read used ALS variants in each .vcf file +with open('var_in_file.json') as json_data: + var_file_dic = json.load(json_data) + +# read promoter table +with open('promoter1.csv', 'rb') as f: + reader = csv.reader(f) + promoter = list(reader) + +# read used variant posotions +with open('all_ALS_var.txt','r') as f: + var_idx = [i.replace('\n','') for i in f] + +# mapping dictionary +var_num_dict = {"0/0":'0', + "0/1":'1', + "1/0":'1', + "1/1":'2', + "./.":'-1'} + +# IDS: +labels_file = 'labes.csv' +labels_df = pd.read_csv(labels_file,index_col=0) +ids = labels_df.FID.tolist() + +# create individual files +print 'Create individual files' +for ind in ids: + id_file_name = 'individual/'+str(ind)+'.txt' + id_file = open(id_file_name, 'w') + id_file.write('') + +## path to vcf files +#files = os.listdir('./chr22') +#files.sort() +#files = files[1:] +files = ['xaa.vcf'] + + +print 'Start writing....' +num_vcf = len(files) +num_vcf_batch = len(files)*0.05 +vcf_i = 0 +for f_str in files: + if vcf_i % num_vcf_batch == 0: + print vcf_i / num_vcf*100., '%....' + + file_name = '' + f_str + with open(file_name, 'r') as f: + lines = [l.replace('\n', '').split("\t") for l in f if not l.startswith('##')] + info = lines[2:] + # create dataframe + v_df = pd.DataFrame.from_records(info, columns=lines[0]) + use_less_l = ["ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"] + v_df.drop(use_less_l, axis=1, inplace=True) + # 0/0 ==> 0 i.e. + samples = lines[0][9:] + replace_dict = {} + for i in samples: + replace_dict[i] = var_num_dict + + v_df_r = v_df.replace(to_replace=replace_dict) + var_df = v_df_r[v_df_r['POS'].isin(var_file_dic[f_str])] + # write to file + for ind in ids: + # the id in vcf file is 'LP6008192-DNA_E10_LP6008192-DNA_E10' for 'LP6008192-DNA_E10' + new_ind = str(ind)+'_'+str(ind) + if new_ind in var_df.columns.tolist()[2:]: + info = var_df[new_ind].tolist() + id_file_name = 'individual/' + str(ind) + '.txt' + id_file = open(id_file_name, 'a') + for item in info: + id_file.write("%s\n" % item) + id_file.close() + else: + print ind + vcf_i += 1. + +print "100.0%... Done, generated all used ALS variants for each individual" + +num_pro = 0 +print "Generating No.",num_pro," promoter in chr",22 + +p_start,p_end = promoter_var_idx(0,promoter,var_idx=var_idx) +promoter_ind = {} +for ind in ids: + indiv_file = 'individual/'+str(ind)+'.txt' + with open(indiv_file,'r') as f: + ind_v = [i.replace('\n','') for i in f] + promo = ind_v[p_start:p_end] + promoter_ind[ind] = promo + +with open('promoters/chr22.json', 'w') as fp: + json.dump(promoter_ind, fp) + +