|
a |
|
b/1000genomes_dataExploration.ipynb |
|
|
1 |
{ |
|
|
2 |
"cells": [ |
|
|
3 |
{ |
|
|
4 |
"cell_type": "code", |
|
|
5 |
"execution_count": 28, |
|
|
6 |
"metadata": {}, |
|
|
7 |
"outputs": [], |
|
|
8 |
"source": [ |
|
|
9 |
"import pandas as pd\n", |
|
|
10 |
"import numpy as np\n", |
|
|
11 |
"import io\n", |
|
|
12 |
"import os\n", |
|
|
13 |
"import matplotlib.pyplot as plot\n", |
|
|
14 |
"from sklearn.preprocessing import LabelEncoder\n", |
|
|
15 |
"from sklearn.feature_selection import chi2\n", |
|
|
16 |
"from sklearn.decomposition import PCA\n", |
|
|
17 |
"from sklearn.linear_model import SGDClassifier" |
|
|
18 |
] |
|
|
19 |
}, |
|
|
20 |
{ |
|
|
21 |
"cell_type": "markdown", |
|
|
22 |
"metadata": {}, |
|
|
23 |
"source": [ |
|
|
24 |
"# Data Import and Processing\n", |
|
|
25 |
"- Haplotype data downloaded from ftp://1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/\n", |
|
|
26 |
" - Files are in VCF format; see the variable `meta` for the header lines that describe the data\n", |
|
|
27 |
"- Sample metadata (`igsr_samples.tsv`) downloaded from\n", |
|
|
28 |
"http://www.internationalgenome.org/data-portal/sample\n" |
|
|
29 |
] |
|
|
30 |
}, |
|
|
31 |
{ |
|
|
32 |
"cell_type": "code", |
|
|
33 |
"execution_count": 18, |
|
|
34 |
"metadata": {}, |
|
|
35 |
"outputs": [], |
|
|
36 |
"source": [ |
|
|
37 |
"# Data Import: load the sample metadata table and the header lines of one VCF file.\n",
"directory = \"/Users/judydu/Desktop/AI4All/data/\"\n",
"# NOTE(review): os.listdir returns names in arbitrary order, so path[chrom] is not a\n",
"# reproducible way to pick a chromosome file -- confirm which file index 1 maps to.\n",
"path = os.listdir(directory)\n",
"chrom = 1\n",
"\n",
"# Tab-separated sample metadata table.\n",
"origins = pd.read_csv(directory + \"igsr_samples.tsv\",\n",
"                      sep = \"\\t\")\n",
"\n",
"# VCF meta-information and header lines ('#'-prefixed) all precede the first record,\n",
"# so stop at the first non-'#' line instead of scanning the whole genotype file.\n",
"meta = []\n",
"with open(directory + path[chrom], 'r') as f:\n",
"    for line in f:\n",
"        if not line.startswith('#'):\n",
"            break\n",
"        meta.append(line)"
|
|
46 |
] |
|
|
47 |
}, |
|
|
48 |
{ |
|
|
49 |
"cell_type": "code", |
|
|
50 |
"execution_count": 21, |
|
|
51 |
"metadata": {}, |
|
|
52 |
"outputs": [ |
|
|
53 |
{ |
|
|
54 |
"data": { |
|
|
55 |
"text/plain": [ |
|
|
56 |
"['##fileformat=VCFv4.2\\n',\n", |
|
|
57 |
" '##ALT=<ID=CN0,Description=\"Copy number allele: 0 copies\">\\n',\n", |
|
|
58 |
" '##ALT=<ID=CN1,Description=\"Copy number allele: 1 copy\">\\n',\n", |
|
|
59 |
" '##ALT=<ID=CN10,Description=\"Copy number allele: 10 copies\">\\n',\n", |
|
|
60 |
" '##ALT=<ID=CN100,Description=\"Copy number allele: 100 copies\">\\n',\n", |
|
|
61 |
" '##ALT=<ID=CN101,Description=\"Copy number allele: 101 copies\">\\n',\n", |
|
|
62 |
" '##ALT=<ID=CN102,Description=\"Copy number allele: 102 copies\">\\n',\n", |
|
|
63 |
" '##ALT=<ID=CN103,Description=\"Copy number allele: 103 copies\">\\n',\n", |
|
|
64 |
" '##ALT=<ID=CN104,Description=\"Copy number allele: 104 copies\">\\n',\n", |
|
|
65 |
" '##ALT=<ID=CN105,Description=\"Copy number allele: 105 copies\">\\n',\n", |
|
|
66 |
" '##ALT=<ID=CN106,Description=\"Copy number allele: 106 copies\">\\n',\n", |
|
|
67 |
" '##ALT=<ID=CN107,Description=\"Copy number allele: 107 copies\">\\n',\n", |
|
|
68 |
" '##ALT=<ID=CN108,Description=\"Copy number allele: 108 copies\">\\n',\n", |
|
|
69 |
" '##ALT=<ID=CN109,Description=\"Copy number allele: 109 copies\">\\n',\n", |
|
|
70 |
" '##ALT=<ID=CN11,Description=\"Copy number allele: 11 copies\">\\n',\n", |
|
|
71 |
" '##ALT=<ID=CN110,Description=\"Copy number allele: 110 copies\">\\n',\n", |
|
|
72 |
" '##ALT=<ID=CN111,Description=\"Copy number allele: 111 copies\">\\n',\n", |
|
|
73 |
" '##ALT=<ID=CN112,Description=\"Copy number allele: 112 copies\">\\n',\n", |
|
|
74 |
" '##ALT=<ID=CN113,Description=\"Copy number allele: 113 copies\">\\n',\n", |
|
|
75 |
" '##ALT=<ID=CN114,Description=\"Copy number allele: 114 copies\">\\n',\n", |
|
|
76 |
" '##ALT=<ID=CN115,Description=\"Copy number allele: 115 copies\">\\n',\n", |
|
|
77 |
" '##ALT=<ID=CN116,Description=\"Copy number allele: 116 copies\">\\n',\n", |
|
|
78 |
" '##ALT=<ID=CN117,Description=\"Copy number allele: 117 copies\">\\n',\n", |
|
|
79 |
" '##ALT=<ID=CN118,Description=\"Copy number allele: 118 copies\">\\n',\n", |
|
|
80 |
" '##ALT=<ID=CN119,Description=\"Copy number allele: 119 copies\">\\n',\n", |
|
|
81 |
" '##ALT=<ID=CN12,Description=\"Copy number allele: 12 copies\">\\n',\n", |
|
|
82 |
" '##ALT=<ID=CN120,Description=\"Copy number allele: 120 copies\">\\n',\n", |
|
|
83 |
" '##ALT=<ID=CN121,Description=\"Copy number allele: 121 copies\">\\n',\n", |
|
|
84 |
" '##ALT=<ID=CN122,Description=\"Copy number allele: 122 copies\">\\n',\n", |
|
|
85 |
" '##ALT=<ID=CN123,Description=\"Copy number allele: 123 copies\">\\n',\n", |
|
|
86 |
" '##ALT=<ID=CN124,Description=\"Copy number allele: 124 copies\">\\n',\n", |
|
|
87 |
" '##ALT=<ID=CN13,Description=\"Copy number allele: 13 copies\">\\n',\n", |
|
|
88 |
" '##ALT=<ID=CN14,Description=\"Copy number allele: 14 copies\">\\n',\n", |
|
|
89 |
" '##ALT=<ID=CN15,Description=\"Copy number allele: 15 copies\">\\n',\n", |
|
|
90 |
" '##ALT=<ID=CN16,Description=\"Copy number allele: 16 copies\">\\n',\n", |
|
|
91 |
" '##ALT=<ID=CN17,Description=\"Copy number allele: 17 copies\">\\n',\n", |
|
|
92 |
" '##ALT=<ID=CN18,Description=\"Copy number allele: 18 copies\">\\n',\n", |
|
|
93 |
" '##ALT=<ID=CN19,Description=\"Copy number allele: 19 copies\">\\n',\n", |
|
|
94 |
" '##ALT=<ID=CN2,Description=\"Copy number allele: 2 copies\">\\n',\n", |
|
|
95 |
" '##ALT=<ID=CN20,Description=\"Copy number allele: 20 copies\">\\n',\n", |
|
|
96 |
" '##ALT=<ID=CN21,Description=\"Copy number allele: 21 copies\">\\n',\n", |
|
|
97 |
" '##ALT=<ID=CN22,Description=\"Copy number allele: 22 copies\">\\n',\n", |
|
|
98 |
" '##ALT=<ID=CN23,Description=\"Copy number allele: 23 copies\">\\n',\n", |
|
|
99 |
" '##ALT=<ID=CN24,Description=\"Copy number allele: 24 copies\">\\n',\n", |
|
|
100 |
" '##ALT=<ID=CN25,Description=\"Copy number allele: 25 copies\">\\n',\n", |
|
|
101 |
" '##ALT=<ID=CN26,Description=\"Copy number allele: 26 copies\">\\n',\n", |
|
|
102 |
" '##ALT=<ID=CN27,Description=\"Copy number allele: 27 copies\">\\n',\n", |
|
|
103 |
" '##ALT=<ID=CN28,Description=\"Copy number allele: 28 copies\">\\n',\n", |
|
|
104 |
" '##ALT=<ID=CN29,Description=\"Copy number allele: 29 copies\">\\n',\n", |
|
|
105 |
" '##ALT=<ID=CN3,Description=\"Copy number allele: 3 copies\">\\n',\n", |
|
|
106 |
" '##ALT=<ID=CN30,Description=\"Copy number allele: 30 copies\">\\n',\n", |
|
|
107 |
" '##ALT=<ID=CN31,Description=\"Copy number allele: 31 copies\">\\n',\n", |
|
|
108 |
" '##ALT=<ID=CN32,Description=\"Copy number allele: 32 copies\">\\n',\n", |
|
|
109 |
" '##ALT=<ID=CN33,Description=\"Copy number allele: 33 copies\">\\n',\n", |
|
|
110 |
" '##ALT=<ID=CN34,Description=\"Copy number allele: 34 copies\">\\n',\n", |
|
|
111 |
" '##ALT=<ID=CN35,Description=\"Copy number allele: 35 copies\">\\n',\n", |
|
|
112 |
" '##ALT=<ID=CN36,Description=\"Copy number allele: 36 copies\">\\n',\n", |
|
|
113 |
" '##ALT=<ID=CN37,Description=\"Copy number allele: 37 copies\">\\n',\n", |
|
|
114 |
" '##ALT=<ID=CN38,Description=\"Copy number allele: 38 copies\">\\n',\n", |
|
|
115 |
" '##ALT=<ID=CN39,Description=\"Copy number allele: 39 copies\">\\n',\n", |
|
|
116 |
" '##ALT=<ID=CN4,Description=\"Copy number allele: 4 copies\">\\n',\n", |
|
|
117 |
" '##ALT=<ID=CN40,Description=\"Copy number allele: 40 copies\">\\n',\n", |
|
|
118 |
" '##ALT=<ID=CN41,Description=\"Copy number allele: 41 copies\">\\n',\n", |
|
|
119 |
" '##ALT=<ID=CN42,Description=\"Copy number allele: 42 copies\">\\n',\n", |
|
|
120 |
" '##ALT=<ID=CN43,Description=\"Copy number allele: 43 copies\">\\n',\n", |
|
|
121 |
" '##ALT=<ID=CN44,Description=\"Copy number allele: 44 copies\">\\n',\n", |
|
|
122 |
" '##ALT=<ID=CN45,Description=\"Copy number allele: 45 copies\">\\n',\n", |
|
|
123 |
" '##ALT=<ID=CN46,Description=\"Copy number allele: 46 copies\">\\n',\n", |
|
|
124 |
" '##ALT=<ID=CN47,Description=\"Copy number allele: 47 copies\">\\n',\n", |
|
|
125 |
" '##ALT=<ID=CN48,Description=\"Copy number allele: 48 copies\">\\n',\n", |
|
|
126 |
" '##ALT=<ID=CN49,Description=\"Copy number allele: 49 copies\">\\n',\n", |
|
|
127 |
" '##ALT=<ID=CN5,Description=\"Copy number allele: 5 copies\">\\n',\n", |
|
|
128 |
" '##ALT=<ID=CN50,Description=\"Copy number allele: 50 copies\">\\n',\n", |
|
|
129 |
" '##ALT=<ID=CN51,Description=\"Copy number allele: 51 copies\">\\n',\n", |
|
|
130 |
" '##ALT=<ID=CN52,Description=\"Copy number allele: 52 copies\">\\n',\n", |
|
|
131 |
" '##ALT=<ID=CN53,Description=\"Copy number allele: 53 copies\">\\n',\n", |
|
|
132 |
" '##ALT=<ID=CN54,Description=\"Copy number allele: 54 copies\">\\n',\n", |
|
|
133 |
" '##ALT=<ID=CN55,Description=\"Copy number allele: 55 copies\">\\n',\n", |
|
|
134 |
" '##ALT=<ID=CN56,Description=\"Copy number allele: 56 copies\">\\n',\n", |
|
|
135 |
" '##ALT=<ID=CN57,Description=\"Copy number allele: 57 copies\">\\n',\n", |
|
|
136 |
" '##ALT=<ID=CN58,Description=\"Copy number allele: 58 copies\">\\n',\n", |
|
|
137 |
" '##ALT=<ID=CN59,Description=\"Copy number allele: 59 copies\">\\n',\n", |
|
|
138 |
" '##ALT=<ID=CN6,Description=\"Copy number allele: 6 copies\">\\n',\n", |
|
|
139 |
" '##ALT=<ID=CN60,Description=\"Copy number allele: 60 copies\">\\n',\n", |
|
|
140 |
" '##ALT=<ID=CN61,Description=\"Copy number allele: 61 copies\">\\n',\n", |
|
|
141 |
" '##ALT=<ID=CN62,Description=\"Copy number allele: 62 copies\">\\n',\n", |
|
|
142 |
" '##ALT=<ID=CN63,Description=\"Copy number allele: 63 copies\">\\n',\n", |
|
|
143 |
" '##ALT=<ID=CN64,Description=\"Copy number allele: 64 copies\">\\n',\n", |
|
|
144 |
" '##ALT=<ID=CN65,Description=\"Copy number allele: 65 copies\">\\n',\n", |
|
|
145 |
" '##ALT=<ID=CN66,Description=\"Copy number allele: 66 copies\">\\n',\n", |
|
|
146 |
" '##ALT=<ID=CN67,Description=\"Copy number allele: 67 copies\">\\n',\n", |
|
|
147 |
" '##ALT=<ID=CN68,Description=\"Copy number allele: 68 copies\">\\n',\n", |
|
|
148 |
" '##ALT=<ID=CN69,Description=\"Copy number allele: 69 copies\">\\n',\n", |
|
|
149 |
" '##ALT=<ID=CN7,Description=\"Copy number allele: 7 copies\">\\n',\n", |
|
|
150 |
" '##ALT=<ID=CN70,Description=\"Copy number allele: 70 copies\">\\n',\n", |
|
|
151 |
" '##ALT=<ID=CN71,Description=\"Copy number allele: 71 copies\">\\n',\n", |
|
|
152 |
" '##ALT=<ID=CN72,Description=\"Copy number allele: 72 copies\">\\n',\n", |
|
|
153 |
" '##ALT=<ID=CN73,Description=\"Copy number allele: 73 copies\">\\n',\n", |
|
|
154 |
" '##ALT=<ID=CN74,Description=\"Copy number allele: 74 copies\">\\n',\n", |
|
|
155 |
" '##ALT=<ID=CN75,Description=\"Copy number allele: 75 copies\">\\n',\n", |
|
|
156 |
" '##ALT=<ID=CN76,Description=\"Copy number allele: 76 copies\">\\n',\n", |
|
|
157 |
" '##ALT=<ID=CN77,Description=\"Copy number allele: 77 copies\">\\n',\n", |
|
|
158 |
" '##ALT=<ID=CN78,Description=\"Copy number allele: 78 copies\">\\n',\n", |
|
|
159 |
" '##ALT=<ID=CN79,Description=\"Copy number allele: 79 copies\">\\n',\n", |
|
|
160 |
" '##ALT=<ID=CN8,Description=\"Copy number allele: 8 copies\">\\n',\n", |
|
|
161 |
" '##ALT=<ID=CN80,Description=\"Copy number allele: 80 copies\">\\n',\n", |
|
|
162 |
" '##ALT=<ID=CN81,Description=\"Copy number allele: 81 copies\">\\n',\n", |
|
|
163 |
" '##ALT=<ID=CN82,Description=\"Copy number allele: 82 copies\">\\n',\n", |
|
|
164 |
" '##ALT=<ID=CN83,Description=\"Copy number allele: 83 copies\">\\n',\n", |
|
|
165 |
" '##ALT=<ID=CN84,Description=\"Copy number allele: 84 copies\">\\n',\n", |
|
|
166 |
" '##ALT=<ID=CN85,Description=\"Copy number allele: 85 copies\">\\n',\n", |
|
|
167 |
" '##ALT=<ID=CN86,Description=\"Copy number allele: 86 copies\">\\n',\n", |
|
|
168 |
" '##ALT=<ID=CN87,Description=\"Copy number allele: 87 copies\">\\n',\n", |
|
|
169 |
" '##ALT=<ID=CN88,Description=\"Copy number allele: 88 copies\">\\n',\n", |
|
|
170 |
" '##ALT=<ID=CN89,Description=\"Copy number allele: 89 copies\">\\n',\n", |
|
|
171 |
" '##ALT=<ID=CN9,Description=\"Copy number allele: 9 copies\">\\n',\n", |
|
|
172 |
" '##ALT=<ID=CN90,Description=\"Copy number allele: 90 copies\">\\n',\n", |
|
|
173 |
" '##ALT=<ID=CN91,Description=\"Copy number allele: 91 copies\">\\n',\n", |
|
|
174 |
" '##ALT=<ID=CN92,Description=\"Copy number allele: 92 copies\">\\n',\n", |
|
|
175 |
" '##ALT=<ID=CN93,Description=\"Copy number allele: 93 copies\">\\n',\n", |
|
|
176 |
" '##ALT=<ID=CN94,Description=\"Copy number allele: 94 copies\">\\n',\n", |
|
|
177 |
" '##ALT=<ID=CN95,Description=\"Copy number allele: 95 copies\">\\n',\n", |
|
|
178 |
" '##ALT=<ID=CN96,Description=\"Copy number allele: 96 copies\">\\n',\n", |
|
|
179 |
" '##ALT=<ID=CN97,Description=\"Copy number allele: 97 copies\">\\n',\n", |
|
|
180 |
" '##ALT=<ID=CN98,Description=\"Copy number allele: 98 copies\">\\n',\n", |
|
|
181 |
" '##ALT=<ID=CN99,Description=\"Copy number allele: 99 copies\">\\n',\n", |
|
|
182 |
" '##ALT=<ID=CNV,Description=\"Copy Number Polymorphism\">\\n',\n", |
|
|
183 |
" '##ALT=<ID=DEL,Description=\"Deletion\">\\n',\n", |
|
|
184 |
" '##ALT=<ID=DUP,Description=\"Duplication\">\\n',\n", |
|
|
185 |
" '##ALT=<ID=INS:ME:ALU,Description=\"Insertion of ALU element\">\\n',\n", |
|
|
186 |
" '##ALT=<ID=INS:ME:LINE1,Description=\"Insertion of LINE1 element\">\\n',\n", |
|
|
187 |
" '##ALT=<ID=INS:ME:SVA,Description=\"Insertion of SVA element\">\\n',\n", |
|
|
188 |
" '##ALT=<ID=INS:MT,Description=\"Nuclear Mitochondrial Insertion\">\\n',\n", |
|
|
189 |
" '##ALT=<ID=INV,Description=\"Inversion\">\\n',\n", |
|
|
190 |
" '##FILTER=<ID=PASS,Description=\"All filters passed\">\\n',\n", |
|
|
191 |
" '##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\\n',\n", |
|
|
192 |
" '##GATKCommandLine=<ID=SelectVariants,CommandLine=\"SelectVariants --output ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes_LDsubset.vcf --keep-ids gwas_sv_ld_filt_af_RSIDonly.list --variant ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf --invertSelect false --exclude-non-variants false --exclude-filtered false --preserve-alleles false --remove-unused-alternates false --restrict-alleles-to ALL --keep-original-ac false --keep-original-dp false --mendelian-violation false --invert-mendelian-violation false --mendelian-violation-qual-threshold 0.0 --select-random-fraction 0.0 --remove-fraction-genotypes 0.0 --fully-decode false --max-indel-size 2147483647 --min-indel-size 0 --max-filtered-genotypes 2147483647 --min-filtered-genotypes 0 --max-fraction-filtered-genotypes 1.0 --min-fraction-filtered-genotypes 0.0 --max-nocall-number 2147483647 --max-nocall-fraction 1.0 --set-filtered-gt-to-nocall false --allow-nonoverlapping-command-line-samples false --suppress-reference-path false --interval-set-rule UNION --interval-padding 0 --interval-exclusion-padding 0 --interval-merging-rule ALL --read-validation-stringency SILENT --seconds-between-progress-updates 10.0 --disable-sequence-dictionary-validation false --create-output-bam-index true --create-output-bam-md5 false --create-output-variant-index true --create-output-variant-md5 false --lenient false --add-output-sam-program-record true --add-output-vcf-command-line true --cloud-prefetch-buffer 40 --cloud-index-prefetch-buffer -1 --disable-bam-index-caching false --sites-only-vcf-output false --help false --version false --showHidden false --verbosity INFO --QUIET false --use-jdk-deflater false --use-jdk-inflater false --gcs-max-retries 20 --gcs-project-for-requester-pays --disable-tool-default-read-filters false\",Version=\"4.1.2.0\",Date=\"July 2, 2019 12:12:58 PM EDT\">\\n',\n", |
|
|
193 |
" '##INFO=<ID=AA,Number=1,Type=String,Description=\"Ancestral Allele. Format: AA|REF|ALT|IndelType. AA: Ancestral allele, REF:Reference Allele, ALT:Alternate Allele, IndelType:Type of Indel (REF, ALT and IndelType are only defined for indels)\">\\n',\n", |
|
|
194 |
" '##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes, for each ALT allele, in the same order as listed\">\\n',\n", |
|
|
195 |
" '##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Total number of alternate alleles in called genotypes\">\\n',\n", |
|
|
196 |
" '##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency, for each ALT allele, in the same order as listed\">\\n',\n", |
|
|
197 |
" '##INFO=<ID=AF,Number=A,Type=Float,Description=\"Estimated allele frequency in the range (0,1)\">\\n',\n", |
|
|
198 |
" '##INFO=<ID=AFR_AF,Number=A,Type=Float,Description=\"Allele frequency in the AFR populations calculated from AC and AN, in the range (0,1)\">\\n',\n", |
|
|
199 |
" '##INFO=<ID=AMR_AF,Number=A,Type=Float,Description=\"Allele frequency in the AMR populations calculated from AC and AN, in the range (0,1)\">\\n',\n", |
|
|
200 |
" '##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">\\n',\n", |
|
|
201 |
" '##INFO=<ID=CIEND,Number=2,Type=Integer,Description=\"Confidence interval around END for imprecise variants\">\\n',\n", |
|
|
202 |
" '##INFO=<ID=CIPOS,Number=2,Type=Integer,Description=\"Confidence interval around POS for imprecise variants\">\\n',\n", |
|
|
203 |
" '##INFO=<ID=CS,Number=1,Type=String,Description=\"Source call set.\">\\n',\n", |
|
|
204 |
" '##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Approximate read depth; some reads may have been filtered\">\\n',\n", |
|
|
205 |
" '##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total read depth; only low coverage data were counted towards the DP, exome data were not used\">\\n',\n", |
|
|
206 |
" '##INFO=<ID=EAS_AF,Number=A,Type=Float,Description=\"Allele frequency in the EAS populations calculated from AC and AN, in the range (0,1)\">\\n',\n", |
|
|
207 |
" '##INFO=<ID=END,Number=1,Type=Integer,Description=\"End coordinate of this variant\">\\n',\n", |
|
|
208 |
" '##INFO=<ID=EUR_AF,Number=A,Type=Float,Description=\"Allele frequency in the EUR populations calculated from AC and AN, in the range (0,1)\">\\n',\n", |
|
|
209 |
" '##INFO=<ID=EX_TARGET,Number=0,Type=Flag,Description=\"indicates whether a variant is within the exon pull down target boundaries\">\\n',\n", |
|
|
210 |
" '##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description=\"Imprecise structural variation\">\\n',\n", |
|
|
211 |
" '##INFO=<ID=MC,Number=.,Type=String,Description=\"Merged calls.\">\\n',\n", |
|
|
212 |
" '##INFO=<ID=MEINFO,Number=4,Type=String,Description=\"Mobile element info of the form NAME,START,END<POLARITY; If there is only 5\\' OR 3\\' support for this call, will be NULL NULL for START and END\">\\n',\n", |
|
|
213 |
" '##INFO=<ID=MEND,Number=1,Type=Integer,Description=\"Mitochondrial end coordinate of inserted sequence\">\\n',\n", |
|
|
214 |
" '##INFO=<ID=MLEN,Number=1,Type=Integer,Description=\"Estimated length of mitochondrial insert\">\\n',\n", |
|
|
215 |
" '##INFO=<ID=MSTART,Number=1,Type=Integer,Description=\"Mitochondrial start coordinate of inserted sequence\">\\n',\n", |
|
|
216 |
" '##INFO=<ID=MULTI_ALLELIC,Number=0,Type=Flag,Description=\"indicates whether a site is multi-allelic\">\\n',\n", |
|
|
217 |
" '##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples with data\">\\n',\n", |
|
|
218 |
" '##INFO=<ID=SAS_AF,Number=A,Type=Float,Description=\"Allele frequency in the SAS populations calculated from AC and AN, in the range (0,1)\">\\n',\n", |
|
|
219 |
" '##INFO=<ID=SVLEN,Number=.,Type=Integer,Description=\"SV length. It is only calculated for structural variation MEIs. For other types of SVs; one may calculate the SV length by INFO:END-START+1, or by finding the difference between lengthes of REF and ALT alleles\">\\n',\n", |
|
|
220 |
" '##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">\\n',\n", |
|
|
221 |
" '##INFO=<ID=TSD,Number=1,Type=String,Description=\"Precise Target Site Duplication for bases, if unknown, value will be NULL\">\\n',\n", |
|
|
222 |
" '##INFO=<ID=VT,Number=.,Type=String,Description=\"indicates what type of variant the line represents\">\\n',\n", |
|
|
223 |
" '##contig=<ID=1,length=249250621>\\n',\n", |
|
|
224 |
" '##contig=<ID=2,length=243199373>\\n',\n", |
|
|
225 |
" '##contig=<ID=3,length=198022430>\\n',\n", |
|
|
226 |
" '##contig=<ID=4,length=191154276>\\n',\n", |
|
|
227 |
" '##contig=<ID=5,length=180915260>\\n',\n", |
|
|
228 |
" '##contig=<ID=6,length=171115067>\\n',\n", |
|
|
229 |
" '##contig=<ID=7,length=159138663>\\n',\n", |
|
|
230 |
" '##contig=<ID=8,length=146364022>\\n',\n", |
|
|
231 |
" '##contig=<ID=9,length=141213431>\\n',\n", |
|
|
232 |
" '##contig=<ID=10,length=135534747>\\n',\n", |
|
|
233 |
" '##contig=<ID=11,length=135006516>\\n',\n", |
|
|
234 |
" '##contig=<ID=12,length=133851895>\\n',\n", |
|
|
235 |
" '##contig=<ID=13,length=115169878>\\n',\n", |
|
|
236 |
" '##contig=<ID=14,length=107349540>\\n',\n", |
|
|
237 |
" '##contig=<ID=15,length=102531392>\\n',\n", |
|
|
238 |
" '##contig=<ID=16,length=90354753>\\n',\n", |
|
|
239 |
" '##contig=<ID=17,length=81195210>\\n',\n", |
|
|
240 |
" '##contig=<ID=18,length=78077248>\\n',\n", |
|
|
241 |
" '##contig=<ID=19,length=59128983>\\n',\n", |
|
|
242 |
" '##contig=<ID=20,length=63025520>\\n',\n", |
|
|
243 |
" '##contig=<ID=21,length=48129895>\\n',\n", |
|
|
244 |
" '##contig=<ID=22,length=51304566>\\n',\n", |
|
|
245 |
" '##contig=<ID=GL000191.1,length=106433>\\n',\n", |
|
|
246 |
" '##contig=<ID=GL000192.1,length=547496>\\n',\n", |
|
|
247 |
" '##contig=<ID=GL000193.1,length=189789>\\n',\n", |
|
|
248 |
" '##contig=<ID=GL000194.1,length=191469>\\n',\n", |
|
|
249 |
" '##contig=<ID=GL000195.1,length=182896>\\n',\n", |
|
|
250 |
" '##contig=<ID=GL000196.1,length=38914>\\n',\n", |
|
|
251 |
" '##contig=<ID=GL000197.1,length=37175>\\n',\n", |
|
|
252 |
" '##contig=<ID=GL000198.1,length=90085>\\n',\n", |
|
|
253 |
" '##contig=<ID=GL000199.1,length=169874>\\n',\n", |
|
|
254 |
" '##contig=<ID=GL000200.1,length=187035>\\n',\n", |
|
|
255 |
" '##contig=<ID=GL000201.1,length=36148>\\n',\n", |
|
|
256 |
" '##contig=<ID=GL000202.1,length=40103>\\n',\n", |
|
|
257 |
" '##contig=<ID=GL000203.1,length=37498>\\n',\n", |
|
|
258 |
" '##contig=<ID=GL000204.1,length=81310>\\n',\n", |
|
|
259 |
" '##contig=<ID=GL000205.1,length=174588>\\n',\n", |
|
|
260 |
" '##contig=<ID=GL000206.1,length=41001>\\n',\n", |
|
|
261 |
" '##contig=<ID=GL000207.1,length=4262>\\n',\n", |
|
|
262 |
" '##contig=<ID=GL000208.1,length=92689>\\n',\n", |
|
|
263 |
" '##contig=<ID=GL000209.1,length=159169>\\n',\n", |
|
|
264 |
" '##contig=<ID=GL000210.1,length=27682>\\n',\n", |
|
|
265 |
" '##contig=<ID=GL000211.1,length=166566>\\n',\n", |
|
|
266 |
" '##contig=<ID=GL000212.1,length=186858>\\n',\n", |
|
|
267 |
" '##contig=<ID=GL000213.1,length=164239>\\n',\n", |
|
|
268 |
" '##contig=<ID=GL000214.1,length=137718>\\n',\n", |
|
|
269 |
" '##contig=<ID=GL000215.1,length=172545>\\n',\n", |
|
|
270 |
" '##contig=<ID=GL000216.1,length=172294>\\n',\n", |
|
|
271 |
" '##contig=<ID=GL000217.1,length=172149>\\n',\n", |
|
|
272 |
" '##contig=<ID=GL000218.1,length=161147>\\n',\n", |
|
|
273 |
" '##contig=<ID=GL000219.1,length=179198>\\n',\n", |
|
|
274 |
" '##contig=<ID=GL000220.1,length=161802>\\n',\n", |
|
|
275 |
" '##contig=<ID=GL000221.1,length=155397>\\n',\n", |
|
|
276 |
" '##contig=<ID=GL000222.1,length=186861>\\n',\n", |
|
|
277 |
" '##contig=<ID=GL000223.1,length=180455>\\n',\n", |
|
|
278 |
" '##contig=<ID=GL000224.1,length=179693>\\n',\n", |
|
|
279 |
" '##contig=<ID=GL000225.1,length=211173>\\n',\n", |
|
|
280 |
" '##contig=<ID=GL000226.1,length=15008>\\n',\n", |
|
|
281 |
" '##contig=<ID=GL000227.1,length=128374>\\n',\n", |
|
|
282 |
" '##contig=<ID=GL000228.1,length=129120>\\n',\n", |
|
|
283 |
" '##contig=<ID=GL000229.1,length=19913>\\n',\n", |
|
|
284 |
" '##contig=<ID=GL000230.1,length=43691>\\n',\n", |
|
|
285 |
" '##contig=<ID=GL000231.1,length=27386>\\n',\n", |
|
|
286 |
" '##contig=<ID=GL000232.1,length=40652>\\n',\n", |
|
|
287 |
" '##contig=<ID=GL000233.1,length=45941>\\n',\n", |
|
|
288 |
" '##contig=<ID=GL000234.1,length=40531>\\n',\n", |
|
|
289 |
" '##contig=<ID=GL000235.1,length=34474>\\n',\n", |
|
|
290 |
" '##contig=<ID=GL000236.1,length=41934>\\n',\n", |
|
|
291 |
" '##contig=<ID=GL000237.1,length=45867>\\n',\n", |
|
|
292 |
" '##contig=<ID=GL000238.1,length=39939>\\n',\n", |
|
|
293 |
" '##contig=<ID=GL000239.1,length=33824>\\n',\n", |
|
|
294 |
" '##contig=<ID=GL000240.1,length=41933>\\n',\n", |
|
|
295 |
" '##contig=<ID=GL000241.1,length=42152>\\n',\n", |
|
|
296 |
" '##contig=<ID=GL000242.1,length=43523>\\n',\n", |
|
|
297 |
" '##contig=<ID=GL000243.1,length=43341>\\n',\n", |
|
|
298 |
" '##contig=<ID=GL000244.1,length=39929>\\n',\n", |
|
|
299 |
" '##contig=<ID=GL000245.1,length=36651>\\n',\n", |
|
|
300 |
" '##contig=<ID=GL000246.1,length=38154>\\n',\n", |
|
|
301 |
" '##contig=<ID=GL000247.1,length=36422>\\n',\n", |
|
|
302 |
" '##contig=<ID=GL000248.1,length=39786>\\n',\n", |
|
|
303 |
" '##contig=<ID=GL000249.1,length=38502>\\n',\n", |
|
|
304 |
" '##contig=<ID=MT,length=16569>\\n',\n", |
|
|
305 |
" '##contig=<ID=NC_007605,length=171823>\\n',\n", |
|
|
306 |
" '##contig=<ID=X,length=155270560>\\n',\n", |
|
|
307 |
" '##contig=<ID=Y,length=59373566>\\n',\n", |
|
|
308 |
" '##contig=<ID=hs37d5,length=35477943>\\n',\n", |
|
|
309 |
" '##fileDate=20150218\\n',\n", |
|
|
310 |
" '##source=1000GenomesPhase3Pipeline\\n',\n", |
|
|
311 |
" '##source=SelectVariants\\n',\n", |
|
|
312 |
" '#CHROM\\tPOS\\tID\\tREF\\tALT\\tQUAL\\tFILTER\\tINFO\\tFORMAT\\tHG00096\\tHG00097\\tHG00099\\tHG00100\\tHG00101\\tHG00102\\tHG00103\\tHG00105\\tHG00106\\tHG00107\\tHG00108\\tHG00109\\tHG00110\\tHG00111\\tHG00112\\tHG00113\\tHG00114\\tHG00115\\tHG00116\\tHG00117\\tHG00118\\tHG00119\\tHG00120\\tHG00121\\tHG00122\\tHG00123\\tHG00125\\tHG00126\\tHG00127\\tHG00128\\tHG00129\\tHG00130\\tHG00131\\tHG00132\\tHG00133\\tHG00136\\tHG00137\\tHG00138\\tHG00139\\tHG00140\\tHG00141\\tHG00142\\tHG00143\\tHG00145\\tHG00146\\tHG00148\\tHG00149\\tHG00150\\tHG00151\\tHG00154\\tHG00155\\tHG00157\\tHG00158\\tHG00159\\tHG00160\\tHG00171\\tHG00173\\tHG00174\\tHG00176\\tHG00177\\tHG00178\\tHG00179\\tHG00180\\tHG00181\\tHG00182\\tHG00183\\tHG00185\\tHG00186\\tHG00187\\tHG00188\\tHG00189\\tHG00190\\tHG00231\\tHG00232\\tHG00233\\tHG00234\\tHG00235\\tHG00236\\tHG00237\\tHG00238\\tHG00239\\tHG00240\\tHG00242\\tHG00243\\tHG00244\\tHG00245\\tHG00246\\tHG00250\\tHG00251\\tHG00252\\tHG00253\\tHG00254\\tHG00255\\tHG00256\\tHG00257\\tHG00258\\tHG00259\\tHG00260\\tHG00261\\tHG00262\\tHG00263\\tHG00264\\tHG00265\\tHG00266\\tHG00267\\tHG00268\\tHG00269\\tHG00271\\tHG00272\\tHG00273\\tHG00274\\tHG00275\\tHG00276\\tHG00277\\tHG00278\\tHG00280\\tHG00281\\tHG00282\\tHG00284\\tHG00285\\tHG00288\\tHG00290\\tHG00304\\tHG00306\\tHG00308\\tHG00309\\tHG00310\\tHG00311\\tHG00313\\tHG00315\\tHG00318\\tHG00319\\tHG00320\\tHG00321\\tHG00323\\tHG00324\\tHG00325\\tHG00326\\tHG00327\\tHG00328\\tHG00329\\tHG00330\\tHG00331\\tHG00332\\tHG00334\\tHG00335\\tHG00336\\tHG00337\\tHG00338\\tHG00339\\tHG00341\\tHG00342\\tHG00343\\tHG00344\\tHG00345\\tHG00346\\tHG00349\\tHG00350\\tHG00351\\tHG00353\\tHG00355\\tHG00356\\tHG00357\\tHG00358\\tHG00360\\tHG00361\\tHG00362\\tHG00364\\tHG00365\\tHG00366\\tHG00367\\tHG00368\\tHG00369\\tHG00371\\tHG00372\\tHG00373\\tHG00375\\tHG00376\\tHG00378\\tHG00379\\tHG00380\\tHG00381\\tHG00382\\tHG00383\\tHG00384\\tHG00403\\tHG00404\\tHG00406\\tHG00407\\tHG00409\\tHG00410\\tHG00419\\tHG00421\\tHG0
0422\\tHG00428\\tHG00436\\tHG00437\\tHG00442\\tHG00443\\tHG00445\\tHG00446\\tHG00448\\tHG00449\\tHG00451\\tHG00452\\tHG00457\\tHG00458\\tHG00463\\tHG00464\\tHG00472\\tHG00473\\tHG00475\\tHG00476\\tHG00478\\tHG00479\\tHG00500\\tHG00513\\tHG00524\\tHG00525\\tHG00530\\tHG00531\\tHG00533\\tHG00534\\tHG00536\\tHG00537\\tHG00542\\tHG00543\\tHG00551\\tHG00553\\tHG00554\\tHG00556\\tHG00557\\tHG00559\\tHG00560\\tHG00565\\tHG00566\\tHG00580\\tHG00581\\tHG00583\\tHG00584\\tHG00589\\tHG00590\\tHG00592\\tHG00593\\tHG00595\\tHG00596\\tHG00598\\tHG00599\\tHG00607\\tHG00608\\tHG00610\\tHG00611\\tHG00613\\tHG00614\\tHG00619\\tHG00620\\tHG00622\\tHG00623\\tHG00625\\tHG00626\\tHG00628\\tHG00629\\tHG00631\\tHG00632\\tHG00634\\tHG00637\\tHG00638\\tHG00640\\tHG00641\\tHG00650\\tHG00651\\tHG00653\\tHG00654\\tHG00656\\tHG00657\\tHG00662\\tHG00663\\tHG00671\\tHG00672\\tHG00674\\tHG00675\\tHG00683\\tHG00684\\tHG00689\\tHG00690\\tHG00692\\tHG00693\\tHG00698\\tHG00699\\tHG00701\\tHG00704\\tHG00705\\tHG00707\\tHG00708\\tHG00717\\tHG00728\\tHG00729\\tHG00731\\tHG00732\\tHG00734\\tHG00736\\tHG00737\\tHG00739\\tHG00740\\tHG00742\\tHG00743\\tHG00759\\tHG00766\\tHG00844\\tHG00851\\tHG00864\\tHG00867\\tHG00879\\tHG00881\\tHG00956\\tHG00978\\tHG00982\\tHG01028\\tHG01029\\tHG01031\\tHG01046\\tHG01047\\tHG01048\\tHG01049\\tHG01051\\tHG01052\\tHG01054\\tHG01055\\tHG01058\\tHG01060\\tHG01061\\tHG01063\\tHG01064\\tHG01066\\tHG01067\\tHG01069\\tHG01070\\tHG01072\\tHG01073\\tHG01075\\tHG01077\\tHG01079\\tHG01080\\tHG01082\\tHG01083\\tHG01085\\tHG01086\\tHG01088\\tHG01089\\tHG01092\\tHG01094\\tHG01095\\tHG01097\\tHG01098\\tHG01101\\tHG01102\\tHG01104\\tHG01105\\tHG01107\\tHG01108\\tHG01110\\tHG01111\\tHG01112\\tHG01113\\tHG01119\\tHG01121\\tHG01122\\tHG01124\\tHG01125\\tHG01130\\tHG01131\\tHG01133\\tHG01134\\tHG01136\\tHG01137\\tHG01139\\tHG01140\\tHG01142\\tHG01148\\tHG01149\\tHG01161\\tHG01162\\tHG01164\\tHG01167\\tHG01168\\tHG01170\\tHG01171\\tHG01173\\tHG01174\\tHG01176\\tHG01177\\tHG01182\\tHG01183\\tHG0
1187\\tHG01188\\tHG01190\\tHG01191\\tHG01197\\tHG01198\\tHG01200\\tHG01204\\tHG01205\\tHG01241\\tHG01242\\tHG01247\\tHG01248\\tHG01250\\tHG01251\\tHG01253\\tHG01254\\tHG01256\\tHG01257\\tHG01259\\tHG01260\\tHG01269\\tHG01271\\tHG01272\\tHG01275\\tHG01277\\tHG01280\\tHG01281\\tHG01284\\tHG01286\\tHG01302\\tHG01303\\tHG01305\\tHG01308\\tHG01311\\tHG01312\\tHG01323\\tHG01325\\tHG01326\\tHG01334\\tHG01341\\tHG01342\\tHG01344\\tHG01345\\tHG01348\\tHG01350\\tHG01351\\tHG01353\\tHG01354\\tHG01356\\tHG01357\\tHG01359\\tHG01360\\tHG01362\\tHG01363\\tHG01365\\tHG01366\\tHG01369\\tHG01372\\tHG01374\\tHG01375\\tHG01377\\tHG01378\\tHG01383\\tHG01384\\tHG01389\\tHG01390\\tHG01392\\tHG01393\\tHG01395\\tHG01396\\tHG01398\\tHG01402\\tHG01403\\tHG01405\\tHG01412\\tHG01413\\tHG01414\\tHG01431\\tHG01432\\tHG01435\\tHG01437\\tHG01438\\tHG01440\\tHG01441\\tHG01443\\tHG01444\\tHG01447\\tHG01455\\tHG01456\\tHG01459\\tHG01461\\tHG01462\\tHG01464\\tHG01465\\tHG01468\\tHG01474\\tHG01479\\tHG01485\\tHG01486\\tHG01488\\tHG01489\\tHG01491\\tHG01492\\tHG01494\\tHG01495\\tHG01497\\tHG01498\\tHG01500\\tHG01501\\tHG01503\\tHG01504\\tHG01506\\tHG01507\\tHG01509\\tHG01510\\tHG01512\\tHG01513\\tHG01515\\tHG01516\\tHG01518\\tHG01519\\tHG01521\\tHG01522\\tHG01524\\tHG01525\\tHG01527\\tHG01528\\tHG01530\\tHG01531\\tHG01536\\tHG01537\\tHG01550\\tHG01551\\tHG01556\\tHG01565\\tHG01566\\tHG01571\\tHG01572\\tHG01577\\tHG01578\\tHG01583\\tHG01586\\tHG01589\\tHG01593\\tHG01595\\tHG01596\\tHG01597\\tHG01598\\tHG01599\\tHG01600\\tHG01602\\tHG01603\\tHG01605\\tHG01606\\tHG01607\\tHG01608\\tHG01610\\tHG01612\\tHG01613\\tHG01615\\tHG01617\\tHG01618\\tHG01619\\tHG01620\\tHG01623\\tHG01624\\tHG01625\\tHG01626\\tHG01628\\tHG01630\\tHG01631\\tHG01632\\tHG01668\\tHG01669\\tHG01670\\tHG01672\\tHG01673\\tHG01675\\tHG01676\\tHG01678\\tHG01679\\tHG01680\\tHG01682\\tHG01684\\tHG01685\\tHG01686\\tHG01694\\tHG01695\\tHG01697\\tHG01699\\tHG01700\\tHG01702\\tHG01704\\tHG01705\\tHG01707\\tHG01708\\tHG01709\\tHG01710\\tHG01746\\tHG0
1747\\tHG01756\\tHG01757\\tHG01761\\tHG01762\\tHG01765\\tHG01766\\tHG01767\\tHG01768\\tHG01770\\tHG01771\\tHG01773\\tHG01775\\tHG01776\\tHG01777\\tHG01779\\tHG01781\\tHG01783\\tHG01784\\tHG01785\\tHG01786\\tHG01789\\tHG01790\\tHG01791\\tHG01794\\tHG01795\\tHG01796\\tHG01797\\tHG01798\\tHG01799\\tHG01800\\tHG01801\\tHG01802\\tHG01804\\tHG01805\\tHG01806\\tHG01807\\tHG01808\\tHG01809\\tHG01810\\tHG01811\\tHG01812\\tHG01813\\tHG01815\\tHG01816\\tHG01817\\tHG01840\\tHG01841\\tHG01842\\tHG01843\\tHG01844\\tHG01845\\tHG01846\\tHG01847\\tHG01848\\tHG01849\\tHG01850\\tHG01851\\tHG01852\\tHG01853\\tHG01855\\tHG01857\\tHG01858\\tHG01859\\tHG01860\\tHG01861\\tHG01862\\tHG01863\\tHG01864\\tHG01865\\tHG01866\\tHG01867\\tHG01868\\tHG01869\\tHG01870\\tHG01871\\tHG01872\\tHG01873\\tHG01874\\tHG01878\\tHG01879\\tHG01880\\tHG01882\\tHG01883\\tHG01885\\tHG01886\\tHG01889\\tHG01890\\tHG01892\\tHG01893\\tHG01894\\tHG01896\\tHG01912\\tHG01914\\tHG01915\\tHG01917\\tHG01918\\tHG01920\\tHG01921\\tHG01923\\tHG01924\\tHG01926\\tHG01927\\tHG01932\\tHG01933\\tHG01935\\tHG01936\\tHG01938\\tHG01939\\tHG01941\\tHG01942\\tHG01944\\tHG01945\\tHG01947\\tHG01948\\tHG01950\\tHG01951\\tHG01953\\tHG01954\\tHG01956\\tHG01958\\tHG01961\\tHG01965\\tHG01967\\tHG01968\\tHG01970\\tHG01971\\tHG01973\\tHG01974\\tHG01976\\tHG01977\\tHG01979\\tHG01980\\tHG01982\\tHG01985\\tHG01986\\tHG01988\\tHG01989\\tHG01990\\tHG01991\\tHG01992\\tHG01997\\tHG02002\\tHG02003\\tHG02006\\tHG02008\\tHG02009\\tHG02010\\tHG02012\\tHG02013\\tHG02014\\tHG02016\\tHG02017\\tHG02019\\tHG02020\\tHG02023\\tHG02025\\tHG02026\\tHG02028\\tHG02029\\tHG02031\\tHG02032\\tHG02035\\tHG02040\\tHG02047\\tHG02048\\tHG02049\\tHG02050\\tHG02051\\tHG02052\\tHG02053\\tHG02054\\tHG02057\\tHG02058\\tHG02060\\tHG02061\\tHG02064\\tHG02067\\tHG02069\\tHG02070\\tHG02072\\tHG02073\\tHG02075\\tHG02076\\tHG02078\\tHG02079\\tHG02081\\tHG02082\\tHG02084\\tHG02085\\tHG02086\\tHG02087\\tHG02088\\tHG02089\\tHG02090\\tHG02095\\tHG02102\\tHG02104\\tHG02105\\tHG02107\\tHG0
2108\\tHG02111\\tHG02113\\tHG02116\\tHG02121\\tHG02122\\tHG02127\\tHG02128\\tHG02130\\tHG02131\\tHG02133\\tHG02134\\tHG02136\\tHG02137\\tHG02138\\tHG02139\\tHG02140\\tHG02141\\tHG02142\\tHG02143\\tHG02144\\tHG02146\\tHG02147\\tHG02150\\tHG02151\\tHG02152\\tHG02153\\tHG02154\\tHG02155\\tHG02156\\tHG02164\\tHG02165\\tHG02166\\tHG02178\\tHG02179\\tHG02180\\tHG02181\\tHG02182\\tHG02184\\tHG02185\\tHG02186\\tHG02187\\tHG02188\\tHG02190\\tHG02215\\tHG02219\\tHG02220\\tHG02221\\tHG02223\\tHG02224\\tHG02230\\tHG02231\\tHG02232\\tHG02233\\tHG02235\\tHG02236\\tHG02238\\tHG02239\\tHG02250\\tHG02252\\tHG02253\\tHG02255\\tHG02256\\tHG02259\\tHG02260\\tHG02262\\tHG02265\\tHG02266\\tHG02271\\tHG02272\\tHG02274\\tHG02275\\tHG02277\\tHG02278\\tHG02281\\tHG02282\\tHG02283\\tHG02284\\tHG02285\\tHG02286\\tHG02291\\tHG02292\\tHG02298\\tHG02299\\tHG02301\\tHG02304\\tHG02307\\tHG02308\\tHG02309\\tHG02312\\tHG02314\\tHG02315\\tHG02317\\tHG02318\\tHG02322\\tHG02323\\tHG02325\\tHG02330\\tHG02332\\tHG02334\\tHG02337\\tHG02339\\tHG02343\\tHG02345\\tHG02348\\tHG02351\\tHG02353\\tHG02355\\tHG02356\\tHG02360\\tHG02364\\tHG02367\\tHG02371\\tHG02373\\tHG02374\\tHG02375\\tHG02379\\tHG02380\\tHG02382\\tHG02383\\tHG02384\\tHG02385\\tHG02386\\tHG02389\\tHG02390\\tHG02391\\tHG02392\\tHG02394\\tHG02395\\tHG02396\\tHG02397\\tHG02398\\tHG02399\\tHG02401\\tHG02402\\tHG02406\\tHG02407\\tHG02408\\tHG02409\\tHG02410\\tHG02419\\tHG02420\\tHG02425\\tHG02427\\tHG02429\\tHG02433\\tHG02439\\tHG02442\\tHG02445\\tHG02449\\tHG02450\\tHG02455\\tHG02461\\tHG02462\\tHG02464\\tHG02465\\tHG02470\\tHG02471\\tHG02476\\tHG02477\\tHG02479\\tHG02481\\tHG02484\\tHG02485\\tHG02489\\tHG02490\\tHG02491\\tHG02493\\tHG02494\\tHG02496\\tHG02497\\tHG02501\\tHG02502\\tHG02505\\tHG02508\\tHG02511\\tHG02512\\tHG02513\\tHG02521\\tHG02522\\tHG02536\\tHG02537\\tHG02541\\tHG02545\\tHG02546\\tHG02549\\tHG02554\\tHG02555\\tHG02557\\tHG02558\\tHG02561\\tHG02562\\tHG02568\\tHG02570\\tHG02571\\tHG02573\\tHG02574\\tHG02577\\tHG02580\\tHG02582\\tHG0
2583\\tHG02585\\tHG02586\\tHG02588\\tHG02589\\tHG02594\\tHG02595\\tHG02597\\tHG02600\\tHG02601\\tHG02603\\tHG02604\\tHG02610\\tHG02611\\tHG02613\\tHG02614\\tHG02620\\tHG02621\\tHG02623\\tHG02624\\tHG02628\\tHG02629\\tHG02634\\tHG02635\\tHG02642\\tHG02643\\tHG02645\\tHG02646\\tHG02648\\tHG02649\\tHG02651\\tHG02652\\tHG02654\\tHG02655\\tHG02657\\tHG02658\\tHG02660\\tHG02661\\tHG02666\\tHG02667\\tHG02675\\tHG02676\\tHG02678\\tHG02679\\tHG02681\\tHG02682\\tHG02684\\tHG02685\\tHG02687\\tHG02688\\tHG02690\\tHG02691\\tHG02694\\tHG02696\\tHG02697\\tHG02699\\tHG02700\\tHG02702\\tHG02703\\tHG02715\\tHG02716\\tHG02721\\tHG02722\\tHG02724\\tHG02725\\tHG02727\\tHG02728\\tHG02731\\tHG02733\\tHG02734\\tHG02736\\tHG02737\\tHG02756\\tHG02757\\tHG02759\\tHG02760\\tHG02763\\tHG02768\\tHG02769\\tHG02771\\tHG02772\\tHG02774\\tHG02775\\tHG02778\\tHG02780\\tHG02783\\tHG02784\\tHG02786\\tHG02787\\tHG02789\\tHG02790\\tHG02792\\tHG02793\\tHG02798\\tHG02799\\tHG02804\\tHG02805\\tHG02807\\tHG02808\\tHG02810\\tHG02811\\tHG02813\\tHG02814\\tHG02816\\tHG02817\\tHG02819\\tHG02820\\tHG02836\\tHG02837\\tHG02839\\tHG02840\\tHG02851\\tHG02852\\tHG02854\\tHG02855\\tHG02860\\tHG02861\\tHG02870\\tHG02878\\tHG02879\\tHG02881\\tHG02882\\tHG02884\\tHG02885\\tHG02887\\tHG02888\\tHG02890\\tHG02891\\tHG02895\\tHG02896\\tHG02922\\tHG02923\\tHG02938\\tHG02941\\tHG02943\\tHG02944\\tHG02946\\tHG02947\\tHG02952\\tHG02953\\tHG02968\\tHG02970\\tHG02971\\tHG02973\\tHG02974\\tHG02976\\tHG02977\\tHG02979\\tHG02981\\tHG02982\\tHG02983\\tHG03006\\tHG03007\\tHG03009\\tHG03012\\tHG03015\\tHG03016\\tHG03018\\tHG03019\\tHG03021\\tHG03022\\tHG03024\\tHG03025\\tHG03027\\tHG03028\\tHG03039\\tHG03040\\tHG03045\\tHG03046\\tHG03048\\tHG03049\\tHG03052\\tHG03054\\tHG03055\\tHG03057\\tHG03058\\tHG03060\\tHG03061\\tHG03063\\tHG03064\\tHG03066\\tHG03069\\tHG03072\\tHG03073\\tHG03074\\tHG03077\\tHG03078\\tHG03079\\tHG03081\\tHG03082\\tHG03084\\tHG03085\\tHG03086\\tHG03088\\tHG03091\\tHG03095\\tHG03096\\tHG03097\\tHG03099\\tHG03100\\tHG0
3103\\tHG03105\\tHG03108\\tHG03109\\tHG03111\\tHG03112\\tHG03114\\tHG03115\\tHG03117\\tHG03118\\tHG03120\\tHG03121\\tHG03123\\tHG03124\\tHG03126\\tHG03127\\tHG03129\\tHG03130\\tHG03132\\tHG03133\\tHG03135\\tHG03136\\tHG03139\\tHG03157\\tHG03159\\tHG03160\\tHG03162\\tHG03163\\tHG03166\\tHG03168\\tHG03169\\tHG03172\\tHG03175\\tHG03189\\tHG03190\\tHG03193\\tHG03195\\tHG03196\\tHG03198\\tHG03199\\tHG03202\\tHG03209\\tHG03212\\tHG03224\\tHG03225\\tHG03228\\tHG03229\\tHG03234\\tHG03235\\tHG03237\\tHG03238\\tHG03240\\tHG03241\\tHG03246\\tHG03247\\tHG03258\\tHG03259\\tHG03265\\tHG03267\\tHG03268\\tHG03270\\tHG03271\\tHG03279\\tHG03280\\tHG03291\\tHG03294\\tHG03295\\tHG03297\\tHG03298\\tHG03300\\tHG03301\\tHG03303\\tHG03304\\tHG03311\\tHG03313\\tHG03342\\tHG03343\\tHG03351\\tHG03352\\tHG03354\\tHG03363\\tHG03366\\tHG03367\\tHG03369\\tHG03370\\tHG03372\\tHG03376\\tHG03378\\tHG03380\\tHG03382\\tHG03385\\tHG03388\\tHG03391\\tHG03394\\tHG03397\\tHG03401\\tHG03410\\tHG03419\\tHG03428\\tHG03432\\tHG03433\\tHG03436\\tHG03437\\tHG03439\\tHG03442\\tHG03445\\tHG03446\\tHG03449\\tHG03451\\tHG03452\\tHG03455\\tHG03457\\tHG03458\\tHG03460\\tHG03461\\tHG03464\\tHG03469\\tHG03470\\tHG03472\\tHG03473\\tHG03476\\tHG03478\\tHG03479\\tHG03484\\tHG03485\\tHG03488\\tHG03490\\tHG03491\\tHG03499\\tHG03511\\tHG03514\\tHG03515\\tHG03517\\tHG03518\\tHG03520\\tHG03521\\tHG03538\\tHG03539\\tHG03547\\tHG03548\\tHG03556\\tHG03557\\tHG03558\\tHG03559\\tHG03563\\tHG03565\\tHG03567\\tHG03571\\tHG03572\\tHG03575\\tHG03577\\tHG03578\\tHG03583\\tHG03585\\tHG03589\\tHG03593\\tHG03594\\tHG03595\\tHG03598\\tHG03600\\tHG03603\\tHG03604\\tHG03607\\tHG03611\\tHG03615\\tHG03616\\tHG03619\\tHG03624\\tHG03625\\tHG03629\\tHG03631\\tHG03634\\tHG03636\\tHG03640\\tHG03642\\tHG03643\\tHG03644\\tHG03645\\tHG03646\\tHG03649\\tHG03652\\tHG03653\\tHG03660\\tHG03663\\tHG03667\\tHG03668\\tHG03672\\tHG03673\\tHG03679\\tHG03680\\tHG03681\\tHG03684\\tHG03685\\tHG03686\\tHG03687\\tHG03689\\tHG03690\\tHG03691\\tHG03692\\tHG03693\\tHG0
3694\\tHG03695\\tHG03696\\tHG03697\\tHG03698\\tHG03702\\tHG03703\\tHG03705\\tHG03706\\tHG03708\\tHG03709\\tHG03711\\tHG03713\\tHG03714\\tHG03716\\tHG03717\\tHG03718\\tHG03720\\tHG03722\\tHG03727\\tHG03729\\tHG03730\\tHG03731\\tHG03733\\tHG03736\\tHG03738\\tHG03740\\tHG03741\\tHG03742\\tHG03743\\tHG03744\\tHG03745\\tHG03746\\tHG03750\\tHG03752\\tHG03753\\tHG03754\\tHG03755\\tHG03756\\tHG03757\\tHG03760\\tHG03762\\tHG03765\\tHG03767\\tHG03770\\tHG03771\\tHG03772\\tHG03773\\tHG03774\\tHG03775\\tHG03777\\tHG03778\\tHG03779\\tHG03780\\tHG03781\\tHG03782\\tHG03784\\tHG03785\\tHG03786\\tHG03787\\tHG03788\\tHG03789\\tHG03790\\tHG03792\\tHG03793\\tHG03796\\tHG03800\\tHG03802\\tHG03803\\tHG03805\\tHG03808\\tHG03809\\tHG03812\\tHG03814\\tHG03815\\tHG03817\\tHG03821\\tHG03823\\tHG03824\\tHG03826\\tHG03829\\tHG03830\\tHG03832\\tHG03833\\tHG03836\\tHG03837\\tHG03838\\tHG03844\\tHG03846\\tHG03848\\tHG03849\\tHG03850\\tHG03851\\tHG03854\\tHG03856\\tHG03857\\tHG03858\\tHG03861\\tHG03862\\tHG03863\\tHG03864\\tHG03866\\tHG03867\\tHG03868\\tHG03869\\tHG03870\\tHG03871\\tHG03872\\tHG03873\\tHG03874\\tHG03875\\tHG03882\\tHG03884\\tHG03885\\tHG03886\\tHG03887\\tHG03888\\tHG03890\\tHG03894\\tHG03895\\tHG03896\\tHG03897\\tHG03898\\tHG03899\\tHG03900\\tHG03902\\tHG03905\\tHG03907\\tHG03908\\tHG03910\\tHG03911\\tHG03913\\tHG03914\\tHG03916\\tHG03917\\tHG03919\\tHG03920\\tHG03922\\tHG03925\\tHG03926\\tHG03928\\tHG03931\\tHG03934\\tHG03937\\tHG03940\\tHG03941\\tHG03943\\tHG03944\\tHG03945\\tHG03947\\tHG03949\\tHG03950\\tHG03951\\tHG03953\\tHG03955\\tHG03960\\tHG03963\\tHG03965\\tHG03967\\tHG03968\\tHG03969\\tHG03971\\tHG03973\\tHG03974\\tHG03976\\tHG03977\\tHG03978\\tHG03985\\tHG03986\\tHG03989\\tHG03990\\tHG03991\\tHG03995\\tHG03998\\tHG03999\\tHG04001\\tHG04002\\tHG04003\\tHG04006\\tHG04014\\tHG04015\\tHG04017\\tHG04018\\tHG04019\\tHG04020\\tHG04022\\tHG04023\\tHG04025\\tHG04026\\tHG04029\\tHG04033\\tHG04035\\tHG04038\\tHG04039\\tHG04042\\tHG04047\\tHG04054\\tHG04056\\tHG04059\\tHG04060\\tHG0
4061\\tHG04062\\tHG04063\\tHG04070\\tHG04075\\tHG04076\\tHG04080\\tHG04090\\tHG04093\\tHG04094\\tHG04096\\tHG04098\\tHG04099\\tHG04100\\tHG04106\\tHG04107\\tHG04118\\tHG04131\\tHG04134\\tHG04140\\tHG04141\\tHG04144\\tHG04146\\tHG04152\\tHG04153\\tHG04155\\tHG04156\\tHG04158\\tHG04159\\tHG04161\\tHG04162\\tHG04164\\tHG04171\\tHG04173\\tHG04176\\tHG04177\\tHG04180\\tHG04182\\tHG04183\\tHG04185\\tHG04186\\tHG04188\\tHG04189\\tHG04194\\tHG04195\\tHG04198\\tHG04200\\tHG04202\\tHG04206\\tHG04209\\tHG04210\\tHG04211\\tHG04212\\tHG04214\\tHG04216\\tHG04219\\tHG04222\\tHG04225\\tHG04227\\tHG04229\\tHG04235\\tHG04238\\tHG04239\\tNA06984\\tNA06985\\tNA06986\\tNA06989\\tNA06994\\tNA07000\\tNA07037\\tNA07048\\tNA07051\\tNA07056\\tNA07347\\tNA07357\\tNA10847\\tNA10851\\tNA11829\\tNA11830\\tNA11831\\tNA11832\\tNA11840\\tNA11843\\tNA11881\\tNA11892\\tNA11893\\tNA11894\\tNA11918\\tNA11919\\tNA11920\\tNA11930\\tNA11931\\tNA11932\\tNA11933\\tNA11992\\tNA11994\\tNA11995\\tNA12003\\tNA12004\\tNA12005\\tNA12006\\tNA12043\\tNA12044\\tNA12045\\tNA12046\\tNA12058\\tNA12144\\tNA12154\\tNA12155\\tNA12156\\tNA12234\\tNA12249\\tNA12272\\tNA12273\\tNA12275\\tNA12282\\tNA12283\\tNA12286\\tNA12287\\tNA12340\\tNA12341\\tNA12342\\tNA12347\\tNA12348\\tNA12383\\tNA12399\\tNA12400\\tNA12413\\tNA12414\\tNA12489\\tNA12546\\tNA12716\\tNA12717\\tNA12718\\tNA12748\\tNA12749\\tNA12750\\tNA12751\\tNA12760\\tNA12761\\tNA12762\\tNA12763\\tNA12775\\tNA12776\\tNA12777\\tNA12778\\tNA12812\\tNA12813\\tNA12814\\tNA12815\\tNA12827\\tNA12828\\tNA12829\\tNA12830\\tNA12842\\tNA12843\\tNA12872\\tNA12873\\tNA12874\\tNA12878\\tNA12889\\tNA12890\\tNA18486\\tNA18488\\tNA18489\\tNA18498\\tNA18499\\tNA18501\\tNA18502\\tNA18504\\tNA18505\\tNA18507\\tNA18508\\tNA18510\\tNA18511\\tNA18516\\tNA18517\\tNA18519\\tNA18520\\tNA18522\\tNA18523\\tNA18525\\tNA18526\\tNA18528\\tNA18530\\tNA18531\\tNA18532\\tNA18533\\tNA18534\\tNA18535\\tNA18536\\tNA18537\\tNA18538\\tNA18539\\tNA18541\\tNA18542\\tNA18543\\tNA18544\\tNA18545\\tNA18546\\tNA1
8547\\tNA18548\\tNA18549\\tNA18550\\tNA18552\\tNA18553\\tNA18555\\tNA18557\\tNA18558\\tNA18559\\tNA18560\\tNA18561\\tNA18562\\tNA18563\\tNA18564\\tNA18565\\tNA18566\\tNA18567\\tNA18570\\tNA18571\\tNA18572\\tNA18573\\tNA18574\\tNA18577\\tNA18579\\tNA18582\\tNA18591\\tNA18592\\tNA18593\\tNA18595\\tNA18596\\tNA18597\\tNA18599\\tNA18602\\tNA18603\\tNA18605\\tNA18606\\tNA18608\\tNA18609\\tNA18610\\tNA18611\\tNA18612\\tNA18613\\tNA18614\\tNA18615\\tNA18616\\tNA18617\\tNA18618\\tNA18619\\tNA18620\\tNA18621\\tNA18622\\tNA18623\\tNA18624\\tNA18625\\tNA18626\\tNA18627\\tNA18628\\tNA18629\\tNA18630\\tNA18631\\tNA18632\\tNA18633\\tNA18634\\tNA18635\\tNA18636\\tNA18637\\tNA18638\\tNA18639\\tNA18640\\tNA18641\\tNA18642\\tNA18643\\tNA18644\\tNA18645\\tNA18646\\tNA18647\\tNA18648\\tNA18740\\tNA18745\\tNA18747\\tNA18748\\tNA18749\\tNA18757\\tNA18853\\tNA18856\\tNA18858\\tNA18861\\tNA18864\\tNA18865\\tNA18867\\tNA18868\\tNA18870\\tNA18871\\tNA18873\\tNA18874\\tNA18876\\tNA18877\\tNA18878\\tNA18879\\tNA18881\\tNA18907\\tNA18908\\tNA18909\\tNA18910\\tNA18912\\tNA18915\\tNA18916\\tNA18917\\tNA18923\\tNA18924\\tNA18933\\tNA18934\\tNA18939\\tNA18940\\tNA18941\\tNA18942\\tNA18943\\tNA18944\\tNA18945\\tNA18946\\tNA18947\\tNA18948\\tNA18949\\tNA18950\\tNA18951\\tNA18952\\tNA18953\\tNA18954\\tNA18956\\tNA18957\\tNA18959\\tNA18960\\tNA18961\\tNA18962\\tNA18963\\tNA18964\\tNA18965\\tNA18966\\tNA18967\\tNA18968\\tNA18969\\tNA18970\\tNA18971\\tNA18972\\tNA18973\\tNA18974\\tNA18975\\tNA18976\\tNA18977\\tNA18978\\tNA18979\\tNA18980\\tNA18981\\tNA18982\\tNA18983\\tNA18984\\tNA18985\\tNA18986\\tNA18987\\tNA18988\\tNA18989\\tNA18990\\tNA18991\\tNA18992\\tNA18993\\tNA18994\\tNA18995\\tNA18997\\tNA18998\\tNA18999\\tNA19000\\tNA19001\\tNA19002\\tNA19003\\tNA19004\\tNA19005\\tNA19006\\tNA19007\\tNA19009\\tNA19010\\tNA19011\\tNA19012\\tNA19017\\tNA19019\\tNA19020\\tNA19023\\tNA19024\\tNA19025\\tNA19026\\tNA19027\\tNA19028\\tNA19030\\tNA19031\\tNA19035\\tNA19036\\tNA19037\\tNA19038\\tNA19041\\tNA19042\\tNA1
9043\\tNA19054\\tNA19055\\tNA19056\\tNA19057\\tNA19058\\tNA19059\\tNA19060\\tNA19062\\tNA19063\\tNA19064\\tNA19065\\tNA19066\\tNA19067\\tNA19068\\tNA19070\\tNA19072\\tNA19074\\tNA19075\\tNA19076\\tNA19077\\tNA19078\\tNA19079\\tNA19080\\tNA19081\\tNA19082\\tNA19083\\tNA19084\\tNA19085\\tNA19086\\tNA19087\\tNA19088\\tNA19089\\tNA19090\\tNA19091\\tNA19092\\tNA19093\\tNA19095\\tNA19096\\tNA19098\\tNA19099\\tNA19102\\tNA19107\\tNA19108\\tNA19113\\tNA19114\\tNA19116\\tNA19117\\tNA19118\\tNA19119\\tNA19121\\tNA19129\\tNA19130\\tNA19131\\tNA19137\\tNA19138\\tNA19141\\tNA19143\\tNA19144\\tNA19146\\tNA19147\\tNA19149\\tNA19152\\tNA19153\\tNA19159\\tNA19160\\tNA19171\\tNA19172\\tNA19175\\tNA19184\\tNA19185\\tNA19189\\tNA19190\\tNA19197\\tNA19198\\tNA19200\\tNA19201\\tNA19204\\tNA19206\\tNA19207\\tNA19209\\tNA19210\\tNA19213\\tNA19214\\tNA19222\\tNA19223\\tNA19225\\tNA19235\\tNA19236\\tNA19238\\tNA19239\\tNA19247\\tNA19248\\tNA19256\\tNA19257\\tNA19307\\tNA19308\\tNA19309\\tNA19310\\tNA19312\\tNA19314\\tNA19315\\tNA19316\\tNA19317\\tNA19318\\tNA19319\\tNA19320\\tNA19321\\tNA19323\\tNA19324\\tNA19327\\tNA19328\\tNA19331\\tNA19332\\tNA19334\\tNA19338\\tNA19346\\tNA19347\\tNA19350\\tNA19351\\tNA19355\\tNA19360\\tNA19372\\tNA19374\\tNA19375\\tNA19376\\tNA19377\\tNA19378\\tNA19379\\tNA19380\\tNA19383\\tNA19384\\tNA19385\\tNA19390\\tNA19391\\tNA19393\\tNA19394\\tNA19395\\tNA19397\\tNA19399\\tNA19401\\tNA19403\\tNA19404\\tNA19428\\tNA19429\\tNA19430\\tNA19431\\tNA19434\\tNA19435\\tNA19436\\tNA19437\\tNA19438\\tNA19439\\tNA19440\\tNA19443\\tNA19445\\tNA19446\\tNA19448\\tNA19449\\tNA19451\\tNA19452\\tNA19454\\tNA19455\\tNA19456\\tNA19457\\tNA19461\\tNA19462\\tNA19463\\tNA19466\\tNA19467\\tNA19468\\tNA19471\\tNA19472\\tNA19473\\tNA19474\\tNA19475\\tNA19625\\tNA19648\\tNA19649\\tNA19651\\tNA19652\\tNA19654\\tNA19655\\tNA19657\\tNA19658\\tNA19661\\tNA19663\\tNA19664\\tNA19669\\tNA19670\\tNA19676\\tNA19678\\tNA19679\\tNA19681\\tNA19682\\tNA19684\\tNA19700\\tNA19701\\tNA19703\\tNA19704\\tNA1
9707\\tNA19711\\tNA19712\\tNA19713\\tNA19716\\tNA19717\\tNA19719\\tNA19720\\tNA19722\\tNA19723\\tNA19725\\tNA19726\\tNA19728\\tNA19729\\tNA19731\\tNA19732\\tNA19734\\tNA19735\\tNA19740\\tNA19741\\tNA19746\\tNA19747\\tNA19749\\tNA19750\\tNA19752\\tNA19755\\tNA19756\\tNA19758\\tNA19759\\tNA19761\\tNA19762\\tNA19764\\tNA19770\\tNA19771\\tNA19773\\tNA19774\\tNA19776\\tNA19777\\tNA19779\\tNA19780\\tNA19782\\tNA19783\\tNA19785\\tNA19786\\tNA19788\\tNA19789\\tNA19792\\tNA19794\\tNA19795\\tNA19818\\tNA19819\\tNA19834\\tNA19835\\tNA19900\\tNA19901\\tNA19904\\tNA19908\\tNA19909\\tNA19913\\tNA19914\\tNA19916\\tNA19917\\tNA19920\\tNA19921\\tNA19922\\tNA19923\\tNA19982\\tNA19984\\tNA20126\\tNA20127\\tNA20274\\tNA20276\\tNA20278\\tNA20281\\tNA20282\\tNA20287\\tNA20289\\tNA20291\\tNA20294\\tNA20296\\tNA20298\\tNA20299\\tNA20314\\tNA20317\\tNA20318\\tNA20320\\tNA20321\\tNA20332\\tNA20334\\tNA20339\\tNA20340\\tNA20342\\tNA20346\\tNA20348\\tNA20351\\tNA20355\\tNA20356\\tNA20357\\tNA20359\\tNA20362\\tNA20412\\tNA20502\\tNA20503\\tNA20504\\tNA20505\\tNA20506\\tNA20507\\tNA20508\\tNA20509\\tNA20510\\tNA20511\\tNA20512\\tNA20513\\tNA20514\\tNA20515\\tNA20516\\tNA20517\\tNA20518\\tNA20519\\tNA20520\\tNA20521\\tNA20522\\tNA20524\\tNA20525\\tNA20527\\tNA20528\\tNA20529\\tNA20530\\tNA20531\\tNA20532\\tNA20533\\tNA20534\\tNA20535\\tNA20536\\tNA20538\\tNA20539\\tNA20540\\tNA20541\\tNA20542\\tNA20543\\tNA20544\\tNA20581\\tNA20582\\tNA20585\\tNA20586\\tNA20587\\tNA20588\\tNA20589\\tNA20752\\tNA20753\\tNA20754\\tNA20755\\tNA20756\\tNA20757\\tNA20758\\tNA20759\\tNA20760\\tNA20761\\tNA20762\\tNA20763\\tNA20764\\tNA20765\\tNA20766\\tNA20767\\tNA20768\\tNA20769\\tNA20770\\tNA20771\\tNA20772\\tNA20773\\tNA20774\\tNA20775\\tNA20778\\tNA20783\\tNA20785\\tNA20786\\tNA20787\\tNA20790\\tNA20792\\tNA20795\\tNA20796\\tNA20797\\tNA20798\\tNA20799\\tNA20800\\tNA20801\\tNA20802\\tNA20803\\tNA20804\\tNA20805\\tNA20806\\tNA20807\\tNA20808\\tNA20809\\tNA20810\\tNA20811\\tNA20812\\tNA20813\\tNA20814\\tNA20815\\tNA2
0818\\tNA20819\\tNA20821\\tNA20822\\tNA20826\\tNA20827\\tNA20828\\tNA20832\\tNA20845\\tNA20846\\tNA20847\\tNA20849\\tNA20850\\tNA20851\\tNA20852\\tNA20853\\tNA20854\\tNA20856\\tNA20858\\tNA20859\\tNA20861\\tNA20862\\tNA20863\\tNA20864\\tNA20866\\tNA20867\\tNA20868\\tNA20869\\tNA20870\\tNA20872\\tNA20874\\tNA20875\\tNA20876\\tNA20877\\tNA20878\\tNA20881\\tNA20882\\tNA20884\\tNA20885\\tNA20886\\tNA20887\\tNA20888\\tNA20889\\tNA20890\\tNA20891\\tNA20892\\tNA20894\\tNA20895\\tNA20896\\tNA20897\\tNA20899\\tNA20900\\tNA20901\\tNA20902\\tNA20903\\tNA20904\\tNA20905\\tNA20906\\tNA20908\\tNA20910\\tNA20911\\tNA21086\\tNA21087\\tNA21088\\tNA21089\\tNA21090\\tNA21091\\tNA21092\\tNA21093\\tNA21094\\tNA21095\\tNA21097\\tNA21098\\tNA21099\\tNA21100\\tNA21101\\tNA21102\\tNA21103\\tNA21104\\tNA21105\\tNA21106\\tNA21107\\tNA21108\\tNA21109\\tNA21110\\tNA21111\\tNA21112\\tNA21113\\tNA21114\\tNA21115\\tNA21116\\tNA21117\\tNA21118\\tNA21119\\tNA21120\\tNA21122\\tNA21123\\tNA21124\\tNA21125\\tNA21126\\tNA21127\\tNA21128\\tNA21129\\tNA21130\\tNA21133\\tNA21135\\tNA21137\\tNA21141\\tNA21142\\tNA21143\\tNA21144\\n']" |
|
|
313 |
] |
|
|
314 |
}, |
|
|
315 |
"execution_count": 21, |
|
|
316 |
"metadata": {}, |
|
|
317 |
"output_type": "execute_result" |
|
|
318 |
} |
|
|
319 |
], |
|
|
320 |
"source": [ |
|
|
321 |
"meta" |
|
|
322 |
] |
|
|
323 |
}, |
|
|
324 |
{ |
|
|
325 |
"cell_type": "code", |
|
|
326 |
"execution_count": 2, |
|
|
327 |
"metadata": {}, |
|
|
328 |
"outputs": [], |
|
|
329 |
"source": [ |
|
|
330 |
"data = pd.read_csv(\"/Users/judydu/Desktop/AI4All/data/ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes_LDsubset.vcf\",\n", |
|
|
331 |
" sep='\\t', comment = \"#\",\n", |
|
|
332 |
" skiprows = 251, index_col = False, header = None)" |
|
|
333 |
] |
|
|
334 |
}, |
|
|
335 |
{ |
|
|
336 |
"cell_type": "code", |
|
|
337 |
"execution_count": 19, |
|
|
338 |
"metadata": {}, |
|
|
339 |
"outputs": [ |
|
|
340 |
{ |
|
|
341 |
"data": { |
|
|
342 |
"text/plain": [ |
|
|
343 |
"(1135, 2513)" |
|
|
344 |
] |
|
|
345 |
}, |
|
|
346 |
"execution_count": 19, |
|
|
347 |
"metadata": {}, |
|
|
348 |
"output_type": "execute_result" |
|
|
349 |
} |
|
|
350 |
], |
|
|
351 |
"source": [ |
|
|
352 |
"data.shape" |
|
|
353 |
] |
|
|
354 |
}, |
|
|
355 |
{ |
|
|
356 |
"cell_type": "code", |
|
|
357 |
"execution_count": 22, |
|
|
358 |
"metadata": {}, |
|
|
359 |
"outputs": [], |
|
|
360 |
"source": [ |
|
|
361 |
"data.columns = meta[len(meta)-1].split(\"\\t\")" |
|
|
362 |
] |
|
|
363 |
}, |
|
|
364 |
{ |
|
|
365 |
"cell_type": "code", |
|
|
366 |
"execution_count": 23, |
|
|
367 |
"metadata": { |
|
|
368 |
"scrolled": false |
|
|
369 |
}, |
|
|
370 |
"outputs": [ |
|
|
371 |
{ |
|
|
372 |
"data": { |
|
|
373 |
"text/html": [ |
|
|
374 |
"<div>\n", |
|
|
375 |
"<style scoped>\n", |
|
|
376 |
" .dataframe tbody tr th:only-of-type {\n", |
|
|
377 |
" vertical-align: middle;\n", |
|
|
378 |
" }\n", |
|
|
379 |
"\n", |
|
|
380 |
" .dataframe tbody tr th {\n", |
|
|
381 |
" vertical-align: top;\n", |
|
|
382 |
" }\n", |
|
|
383 |
"\n", |
|
|
384 |
" .dataframe thead th {\n", |
|
|
385 |
" text-align: right;\n", |
|
|
386 |
" }\n", |
|
|
387 |
"</style>\n", |
|
|
388 |
"<table border=\"1\" class=\"dataframe\">\n", |
|
|
389 |
" <thead>\n", |
|
|
390 |
" <tr style=\"text-align: right;\">\n", |
|
|
391 |
" <th></th>\n", |
|
|
392 |
" <th>#CHROM</th>\n", |
|
|
393 |
" <th>POS</th>\n", |
|
|
394 |
" <th>ID</th>\n", |
|
|
395 |
" <th>REF</th>\n", |
|
|
396 |
" <th>ALT</th>\n", |
|
|
397 |
" <th>QUAL</th>\n", |
|
|
398 |
" <th>FILTER</th>\n", |
|
|
399 |
" <th>INFO</th>\n", |
|
|
400 |
" <th>FORMAT</th>\n", |
|
|
401 |
" <th>HG00096</th>\n", |
|
|
402 |
" <th>...</th>\n", |
|
|
403 |
" <th>NA21128</th>\n", |
|
|
404 |
" <th>NA21129</th>\n", |
|
|
405 |
" <th>NA21130</th>\n", |
|
|
406 |
" <th>NA21133</th>\n", |
|
|
407 |
" <th>NA21135</th>\n", |
|
|
408 |
" <th>NA21137</th>\n", |
|
|
409 |
" <th>NA21141</th>\n", |
|
|
410 |
" <th>NA21142</th>\n", |
|
|
411 |
" <th>NA21143</th>\n", |
|
|
412 |
" <th>NA21144</th>\n", |
|
|
413 |
" </tr>\n", |
|
|
414 |
" </thead>\n", |
|
|
415 |
" <tbody>\n", |
|
|
416 |
" <tr>\n", |
|
|
417 |
" <th>0</th>\n", |
|
|
418 |
" <td>1</td>\n", |
|
|
419 |
" <td>1005806</td>\n", |
|
|
420 |
" <td>rs3934834</td>\n", |
|
|
421 |
" <td>C</td>\n", |
|
|
422 |
" <td>T</td>\n", |
|
|
423 |
" <td>100</td>\n", |
|
|
424 |
" <td>PASS</td>\n", |
|
|
425 |
" <td>AA=C|||;AC=1119;AF=0.223442;AFR_AF=0.3941;AMR_...</td>\n", |
|
|
426 |
" <td>GT</td>\n", |
|
|
427 |
" <td>0|0</td>\n", |
|
|
428 |
" <td>...</td>\n", |
|
|
429 |
" <td>0|0</td>\n", |
|
|
430 |
" <td>0|0</td>\n", |
|
|
431 |
" <td>0|1</td>\n", |
|
|
432 |
" <td>0|1</td>\n", |
|
|
433 |
" <td>0|0</td>\n", |
|
|
434 |
" <td>1|0</td>\n", |
|
|
435 |
" <td>1|0</td>\n", |
|
|
436 |
" <td>0|0</td>\n", |
|
|
437 |
" <td>0|0</td>\n", |
|
|
438 |
" <td>1|0</td>\n", |
|
|
439 |
" </tr>\n", |
|
|
440 |
" <tr>\n", |
|
|
441 |
" <th>1</th>\n", |
|
|
442 |
" <td>1</td>\n", |
|
|
443 |
" <td>1079198</td>\n", |
|
|
444 |
" <td>rs11260603</td>\n", |
|
|
445 |
" <td>T</td>\n", |
|
|
446 |
" <td>C</td>\n", |
|
|
447 |
" <td>100</td>\n", |
|
|
448 |
" <td>PASS</td>\n", |
|
|
449 |
" <td>AA=c|||;AC=1520;AF=0.303514;AFR_AF=0.6271;AMR_...</td>\n", |
|
|
450 |
" <td>GT</td>\n", |
|
|
451 |
" <td>0|1</td>\n", |
|
|
452 |
" <td>...</td>\n", |
|
|
453 |
" <td>0|0</td>\n", |
|
|
454 |
" <td>0|0</td>\n", |
|
|
455 |
" <td>0|0</td>\n", |
|
|
456 |
" <td>0|0</td>\n", |
|
|
457 |
" <td>0|0</td>\n", |
|
|
458 |
" <td>0|0</td>\n", |
|
|
459 |
" <td>0|0</td>\n", |
|
|
460 |
" <td>0|0</td>\n", |
|
|
461 |
" <td>0|1</td>\n", |
|
|
462 |
" <td>0|0</td>\n", |
|
|
463 |
" </tr>\n", |
|
|
464 |
" <tr>\n", |
|
|
465 |
" <th>2</th>\n", |
|
|
466 |
" <td>1</td>\n", |
|
|
467 |
" <td>1247494</td>\n", |
|
|
468 |
" <td>rs12103</td>\n", |
|
|
469 |
" <td>T</td>\n", |
|
|
470 |
" <td>C</td>\n", |
|
|
471 |
" <td>100</td>\n", |
|
|
472 |
" <td>PASS</td>\n", |
|
|
473 |
" <td>AA=T|||;AC=1599;AF=0.319289;AFR_AF=0.0923;AMR_...</td>\n", |
|
|
474 |
" <td>GT</td>\n", |
|
|
475 |
" <td>1|0</td>\n", |
|
|
476 |
" <td>...</td>\n", |
|
|
477 |
" <td>1|0</td>\n", |
|
|
478 |
" <td>1|0</td>\n", |
|
|
479 |
" <td>0|0</td>\n", |
|
|
480 |
" <td>0|0</td>\n", |
|
|
481 |
" <td>1|1</td>\n", |
|
|
482 |
" <td>0|1</td>\n", |
|
|
483 |
" <td>0|0</td>\n", |
|
|
484 |
" <td>0|0</td>\n", |
|
|
485 |
" <td>0|0</td>\n", |
|
|
486 |
" <td>0|1</td>\n", |
|
|
487 |
" </tr>\n", |
|
|
488 |
" <tr>\n", |
|
|
489 |
" <th>3</th>\n", |
|
|
490 |
" <td>1</td>\n", |
|
|
491 |
" <td>2069172</td>\n", |
|
|
492 |
" <td>rs425277</td>\n", |
|
|
493 |
" <td>C</td>\n", |
|
|
494 |
" <td>T</td>\n", |
|
|
495 |
" <td>100</td>\n", |
|
|
496 |
" <td>PASS</td>\n", |
|
|
497 |
" <td>AA=C|||;AC=1128;AF=0.22524;AFR_AF=0.0666;AMR_A...</td>\n", |
|
|
498 |
" <td>GT</td>\n", |
|
|
499 |
" <td>1|0</td>\n", |
|
|
500 |
" <td>...</td>\n", |
|
|
501 |
" <td>0|0</td>\n", |
|
|
502 |
" <td>1|0</td>\n", |
|
|
503 |
" <td>1|0</td>\n", |
|
|
504 |
" <td>0|0</td>\n", |
|
|
505 |
" <td>0|1</td>\n", |
|
|
506 |
" <td>0|0</td>\n", |
|
|
507 |
" <td>0|1</td>\n", |
|
|
508 |
" <td>0|0</td>\n", |
|
|
509 |
" <td>0|1</td>\n", |
|
|
510 |
" <td>0|0</td>\n", |
|
|
511 |
" </tr>\n", |
|
|
512 |
" <tr>\n", |
|
|
513 |
" <th>4</th>\n", |
|
|
514 |
" <td>1</td>\n", |
|
|
515 |
" <td>2069681</td>\n", |
|
|
516 |
" <td>rs3753242</td>\n", |
|
|
517 |
" <td>C</td>\n", |
|
|
518 |
" <td>T</td>\n", |
|
|
519 |
" <td>100</td>\n", |
|
|
520 |
" <td>PASS</td>\n", |
|
|
521 |
" <td>AA=C|||;AC=943;AF=0.188299;AFR_AF=0.0197;AMR_A...</td>\n", |
|
|
522 |
" <td>GT</td>\n", |
|
|
523 |
" <td>0|0</td>\n", |
|
|
524 |
" <td>...</td>\n", |
|
|
525 |
" <td>0|1</td>\n", |
|
|
526 |
" <td>0|0</td>\n", |
|
|
527 |
" <td>0|0</td>\n", |
|
|
528 |
" <td>0|0</td>\n", |
|
|
529 |
" <td>1|0</td>\n", |
|
|
530 |
" <td>0|0</td>\n", |
|
|
531 |
" <td>1|0</td>\n", |
|
|
532 |
" <td>0|1</td>\n", |
|
|
533 |
" <td>0|0</td>\n", |
|
|
534 |
" <td>0|0</td>\n", |
|
|
535 |
" </tr>\n", |
|
|
536 |
" </tbody>\n", |
|
|
537 |
"</table>\n", |
|
|
538 |
"<p>5 rows × 2513 columns</p>\n", |
|
|
539 |
"</div>" |
|
|
540 |
], |
|
|
541 |
"text/plain": [ |
|
|
542 |
" #CHROM POS ID REF ALT QUAL FILTER \\\n", |
|
|
543 |
"0 1 1005806 rs3934834 C T 100 PASS \n", |
|
|
544 |
"1 1 1079198 rs11260603 T C 100 PASS \n", |
|
|
545 |
"2 1 1247494 rs12103 T C 100 PASS \n", |
|
|
546 |
"3 1 2069172 rs425277 C T 100 PASS \n", |
|
|
547 |
"4 1 2069681 rs3753242 C T 100 PASS \n", |
|
|
548 |
"\n", |
|
|
549 |
" INFO FORMAT HG00096 ... \\\n", |
|
|
550 |
"0 AA=C|||;AC=1119;AF=0.223442;AFR_AF=0.3941;AMR_... GT 0|0 ... \n", |
|
|
551 |
"1 AA=c|||;AC=1520;AF=0.303514;AFR_AF=0.6271;AMR_... GT 0|1 ... \n", |
|
|
552 |
"2 AA=T|||;AC=1599;AF=0.319289;AFR_AF=0.0923;AMR_... GT 1|0 ... \n", |
|
|
553 |
"3 AA=C|||;AC=1128;AF=0.22524;AFR_AF=0.0666;AMR_A... GT 1|0 ... \n", |
|
|
554 |
"4 AA=C|||;AC=943;AF=0.188299;AFR_AF=0.0197;AMR_A... GT 0|0 ... \n", |
|
|
555 |
"\n", |
|
|
556 |
" NA21128 NA21129 NA21130 NA21133 NA21135 NA21137 NA21141 NA21142 NA21143 \\\n", |
|
|
557 |
"0 0|0 0|0 0|1 0|1 0|0 1|0 1|0 0|0 0|0 \n", |
|
|
558 |
"1 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|1 \n", |
|
|
559 |
"2 1|0 1|0 0|0 0|0 1|1 0|1 0|0 0|0 0|0 \n", |
|
|
560 |
"3 0|0 1|0 1|0 0|0 0|1 0|0 0|1 0|0 0|1 \n", |
|
|
561 |
"4 0|1 0|0 0|0 0|0 1|0 0|0 1|0 0|1 0|0 \n", |
|
|
562 |
"\n", |
|
|
563 |
" NA21144\\n \n", |
|
|
564 |
"0 1|0 \n", |
|
|
565 |
"1 0|0 \n", |
|
|
566 |
"2 0|1 \n", |
|
|
567 |
"3 0|0 \n", |
|
|
568 |
"4 0|0 \n", |
|
|
569 |
"\n", |
|
|
570 |
"[5 rows x 2513 columns]" |
|
|
571 |
] |
|
|
572 |
}, |
|
|
573 |
"execution_count": 23, |
|
|
574 |
"metadata": {}, |
|
|
575 |
"output_type": "execute_result" |
|
|
576 |
} |
|
|
577 |
], |
|
|
578 |
"source": [ |
|
|
579 |
"data.head()" |
|
|
580 |
] |
|
|
581 |
}, |
|
|
582 |
{ |
|
|
583 |
"cell_type": "code", |
|
|
584 |
"execution_count": 34, |
|
|
585 |
"metadata": {}, |
|
|
586 |
"outputs": [], |
|
|
587 |
"source": [ |
|
|
588 |
"trainingX = data.iloc[:,9:2513]" |
|
|
589 |
] |
|
|
590 |
}, |
|
|
591 |
{ |
|
|
592 |
"cell_type": "markdown", |
|
|
593 |
"metadata": {}, |
|
|
594 |
"source": [ |
|
|
595 |
"## Split data randomly into training and testing sets\n", |
|
|
596 |
"- Later, we may want to stratify the split by y (geographical origin) so that the training and testing sets have balanced class proportions" |
|
|
597 |
] |
|
|
598 |
}, |
|
|
599 |
{ |
|
|
600 |
"cell_type": "code", |
|
|
601 |
"execution_count": null, |
|
|
602 |
"metadata": {}, |
|
|
603 |
"outputs": [], |
|
|
604 |
"source": [ |
|
|
605 |
"# Naive label-encoding of genotypes\n", |
|
|
606 |
"# TODO: align X and Y\n", |
|
|
607 |
"# TODO: label-encode the correct subset of X\n", |
|
|
608 |
"\n", |
|
|
609 |
"# subset filename for chromosome number\n", |
|
|
610 |
"chrom = path[chrom].split(\".\")[]" |
|
|
611 |
] |
|
|
612 |
}, |
|
|
613 |
{ |
|
|
614 |
"cell_type": "code", |
|
|
615 |
"execution_count": null, |
|
|
616 |
"metadata": {}, |
|
|
617 |
"outputs": [], |
|
|
618 |
"source": [ |
|
|
619 |
"trainX = {}\n", |
|
|
620 |
"trainY = {}\n", |
|
|
621 |
"testX = {}\n", |
|
|
622 |
"testY = {}" |
|
|
623 |
] |
|
|
624 |
}, |
|
|
625 |
{ |
|
|
626 |
"cell_type": "code", |
|
|
627 |
"execution_count": null, |
|
|
628 |
"metadata": {}, |
|
|
629 |
"outputs": [], |
|
|
630 |
"source": [ |
|
|
631 |
"## randomly split into training/testing\n", |
|
|
632 |
"train_test_split(test_size = .2, random_state = 7, shuffle = True)" |
|
|
633 |
] |
|
|
634 |
}, |
|
|
635 |
{ |
|
|
636 |
"cell_type": "markdown", |
|
|
637 |
"metadata": {}, |
|
|
638 |
"source": [ |
|
|
639 |
"## Feature selection via chi-squared test of independence" |
|
|
640 |
] |
|
|
641 |
}, |
|
|
642 |
{ |
|
|
643 |
"cell_type": "code", |
|
|
644 |
"execution_count": null, |
|
|
645 |
"metadata": {}, |
|
|
646 |
"outputs": [], |
|
|
647 |
"source": [ |
|
|
648 |
"chi2Statistic,pvals = chi2(trainX[chrom], trainY[chrom])\n", |
|
|
649 |
"trainX[chrom + \"_chi2\"] = trainX[chrom].filter()" |
|
|
650 |
] |
|
|
651 |
}, |
|
|
652 |
{ |
|
|
653 |
"cell_type": "markdown", |
|
|
654 |
"metadata": {}, |
|
|
655 |
"source": [ |
|
|
656 |
"# Feature Selection and Data Processing\n", |
|
|
657 |
"1. Label encode discrete features not listed as continuous in metadata (chunk 1)\n", |
|
|
658 |
"2. Normalize continuous features: mean-center and divide by the norm, both computed w.r.t. the training samples (chunk 2)\n", |
|
|
659 |
"3. Mutual information regression feature selection (chunks 3-4)" |
|
|
660 |
] |
|
|
661 |
}, |
|
|
662 |
{ |
|
|
663 |
"cell_type": "code", |
|
|
664 |
"execution_count": 35, |
|
|
665 |
"metadata": {}, |
|
|
666 |
"outputs": [], |
|
|
667 |
"source": [ |
|
|
668 |
"trainingX = trainingX.apply(LabelEncoder().fit_transform)\n", |
|
|
669 |
"\n", |
|
|
670 |
"#preprocessing.LabelEncoder().fit_transform(trainingX)" |
|
|
671 |
] |
|
|
672 |
}, |
|
|
673 |
{ |
|
|
674 |
"cell_type": "code", |
|
|
675 |
"execution_count": 36, |
|
|
676 |
"metadata": {}, |
|
|
677 |
"outputs": [ |
|
|
678 |
{ |
|
|
679 |
"data": { |
|
|
680 |
"text/html": [ |
|
|
681 |
"<div>\n", |
|
|
682 |
"<style scoped>\n", |
|
|
683 |
" .dataframe tbody tr th:only-of-type {\n", |
|
|
684 |
" vertical-align: middle;\n", |
|
|
685 |
" }\n", |
|
|
686 |
"\n", |
|
|
687 |
" .dataframe tbody tr th {\n", |
|
|
688 |
" vertical-align: top;\n", |
|
|
689 |
" }\n", |
|
|
690 |
"\n", |
|
|
691 |
" .dataframe thead th {\n", |
|
|
692 |
" text-align: right;\n", |
|
|
693 |
" }\n", |
|
|
694 |
"</style>\n", |
|
|
695 |
"<table border=\"1\" class=\"dataframe\">\n", |
|
|
696 |
" <thead>\n", |
|
|
697 |
" <tr style=\"text-align: right;\">\n", |
|
|
698 |
" <th></th>\n", |
|
|
699 |
" <th>HG00096</th>\n", |
|
|
700 |
" <th>HG00097</th>\n", |
|
|
701 |
" <th>HG00099</th>\n", |
|
|
702 |
" <th>HG00100</th>\n", |
|
|
703 |
" <th>HG00101</th>\n", |
|
|
704 |
" <th>HG00102</th>\n", |
|
|
705 |
" <th>HG00103</th>\n", |
|
|
706 |
" <th>HG00105</th>\n", |
|
|
707 |
" <th>HG00106</th>\n", |
|
|
708 |
" <th>HG00107</th>\n", |
|
|
709 |
" <th>...</th>\n", |
|
|
710 |
" <th>NA21128</th>\n", |
|
|
711 |
" <th>NA21129</th>\n", |
|
|
712 |
" <th>NA21130</th>\n", |
|
|
713 |
" <th>NA21133</th>\n", |
|
|
714 |
" <th>NA21135</th>\n", |
|
|
715 |
" <th>NA21137</th>\n", |
|
|
716 |
" <th>NA21141</th>\n", |
|
|
717 |
" <th>NA21142</th>\n", |
|
|
718 |
" <th>NA21143</th>\n", |
|
|
719 |
" <th>NA21144</th>\n", |
|
|
720 |
" </tr>\n", |
|
|
721 |
" </thead>\n", |
|
|
722 |
" <tbody>\n", |
|
|
723 |
" <tr>\n", |
|
|
724 |
" <th>0</th>\n", |
|
|
725 |
" <td>0</td>\n", |
|
|
726 |
" <td>1</td>\n", |
|
|
727 |
" <td>0</td>\n", |
|
|
728 |
" <td>2</td>\n", |
|
|
729 |
" <td>0</td>\n", |
|
|
730 |
" <td>0</td>\n", |
|
|
731 |
" <td>0</td>\n", |
|
|
732 |
" <td>0</td>\n", |
|
|
733 |
" <td>0</td>\n", |
|
|
734 |
" <td>0</td>\n", |
|
|
735 |
" <td>...</td>\n", |
|
|
736 |
" <td>0</td>\n", |
|
|
737 |
" <td>0</td>\n", |
|
|
738 |
" <td>1</td>\n", |
|
|
739 |
" <td>1</td>\n", |
|
|
740 |
" <td>0</td>\n", |
|
|
741 |
" <td>2</td>\n", |
|
|
742 |
" <td>2</td>\n", |
|
|
743 |
" <td>0</td>\n", |
|
|
744 |
" <td>0</td>\n", |
|
|
745 |
" <td>2</td>\n", |
|
|
746 |
" </tr>\n", |
|
|
747 |
" <tr>\n", |
|
|
748 |
" <th>1</th>\n", |
|
|
749 |
" <td>1</td>\n", |
|
|
750 |
" <td>0</td>\n", |
|
|
751 |
" <td>0</td>\n", |
|
|
752 |
" <td>2</td>\n", |
|
|
753 |
" <td>1</td>\n", |
|
|
754 |
" <td>3</td>\n", |
|
|
755 |
" <td>1</td>\n", |
|
|
756 |
" <td>1</td>\n", |
|
|
757 |
" <td>0</td>\n", |
|
|
758 |
" <td>3</td>\n", |
|
|
759 |
" <td>...</td>\n", |
|
|
760 |
" <td>0</td>\n", |
|
|
761 |
" <td>0</td>\n", |
|
|
762 |
" <td>0</td>\n", |
|
|
763 |
" <td>0</td>\n", |
|
|
764 |
" <td>0</td>\n", |
|
|
765 |
" <td>0</td>\n", |
|
|
766 |
" <td>0</td>\n", |
|
|
767 |
" <td>0</td>\n", |
|
|
768 |
" <td>1</td>\n", |
|
|
769 |
" <td>0</td>\n", |
|
|
770 |
" </tr>\n", |
|
|
771 |
" <tr>\n", |
|
|
772 |
" <th>2</th>\n", |
|
|
773 |
" <td>2</td>\n", |
|
|
774 |
" <td>3</td>\n", |
|
|
775 |
" <td>3</td>\n", |
|
|
776 |
" <td>3</td>\n", |
|
|
777 |
" <td>4</td>\n", |
|
|
778 |
" <td>3</td>\n", |
|
|
779 |
" <td>1</td>\n", |
|
|
780 |
" <td>3</td>\n", |
|
|
781 |
" <td>4</td>\n", |
|
|
782 |
" <td>1</td>\n", |
|
|
783 |
" <td>...</td>\n", |
|
|
784 |
" <td>3</td>\n", |
|
|
785 |
" <td>3</td>\n", |
|
|
786 |
" <td>0</td>\n", |
|
|
787 |
" <td>0</td>\n", |
|
|
788 |
" <td>4</td>\n", |
|
|
789 |
" <td>1</td>\n", |
|
|
790 |
" <td>0</td>\n", |
|
|
791 |
" <td>0</td>\n", |
|
|
792 |
" <td>0</td>\n", |
|
|
793 |
" <td>1</td>\n", |
|
|
794 |
" </tr>\n", |
|
|
795 |
" <tr>\n", |
|
|
796 |
" <th>3</th>\n", |
|
|
797 |
" <td>2</td>\n", |
|
|
798 |
" <td>0</td>\n", |
|
|
799 |
" <td>2</td>\n", |
|
|
800 |
" <td>2</td>\n", |
|
|
801 |
" <td>0</td>\n", |
|
|
802 |
" <td>1</td>\n", |
|
|
803 |
" <td>0</td>\n", |
|
|
804 |
" <td>1</td>\n", |
|
|
805 |
" <td>0</td>\n", |
|
|
806 |
" <td>1</td>\n", |
|
|
807 |
" <td>...</td>\n", |
|
|
808 |
" <td>0</td>\n", |
|
|
809 |
" <td>3</td>\n", |
|
|
810 |
" <td>2</td>\n", |
|
|
811 |
" <td>0</td>\n", |
|
|
812 |
" <td>1</td>\n", |
|
|
813 |
" <td>0</td>\n", |
|
|
814 |
" <td>1</td>\n", |
|
|
815 |
" <td>0</td>\n", |
|
|
816 |
" <td>1</td>\n", |
|
|
817 |
" <td>0</td>\n", |
|
|
818 |
" </tr>\n", |
|
|
819 |
" <tr>\n", |
|
|
820 |
" <th>4</th>\n", |
|
|
821 |
" <td>0</td>\n", |
|
|
822 |
" <td>0</td>\n", |
|
|
823 |
" <td>0</td>\n", |
|
|
824 |
" <td>0</td>\n", |
|
|
825 |
" <td>0</td>\n", |
|
|
826 |
" <td>0</td>\n", |
|
|
827 |
" <td>0</td>\n", |
|
|
828 |
" <td>0</td>\n", |
|
|
829 |
" <td>0</td>\n", |
|
|
830 |
" <td>0</td>\n", |
|
|
831 |
" <td>...</td>\n", |
|
|
832 |
" <td>1</td>\n", |
|
|
833 |
" <td>0</td>\n", |
|
|
834 |
" <td>0</td>\n", |
|
|
835 |
" <td>0</td>\n", |
|
|
836 |
" <td>3</td>\n", |
|
|
837 |
" <td>0</td>\n", |
|
|
838 |
" <td>2</td>\n", |
|
|
839 |
" <td>1</td>\n", |
|
|
840 |
" <td>0</td>\n", |
|
|
841 |
" <td>0</td>\n", |
|
|
842 |
" </tr>\n", |
|
|
843 |
" </tbody>\n", |
|
|
844 |
"</table>\n", |
|
|
845 |
"<p>5 rows × 2504 columns</p>\n", |
|
|
846 |
"</div>" |
|
|
847 |
], |
|
|
848 |
"text/plain": [ |
|
|
849 |
" HG00096 HG00097 HG00099 HG00100 HG00101 HG00102 HG00103 HG00105 \\\n", |
|
|
850 |
"0 0 1 0 2 0 0 0 0 \n", |
|
|
851 |
"1 1 0 0 2 1 3 1 1 \n", |
|
|
852 |
"2 2 3 3 3 4 3 1 3 \n", |
|
|
853 |
"3 2 0 2 2 0 1 0 1 \n", |
|
|
854 |
"4 0 0 0 0 0 0 0 0 \n", |
|
|
855 |
"\n", |
|
|
856 |
" HG00106 HG00107 ... NA21128 NA21129 NA21130 NA21133 NA21135 \\\n", |
|
|
857 |
"0 0 0 ... 0 0 1 1 0 \n", |
|
|
858 |
"1 0 3 ... 0 0 0 0 0 \n", |
|
|
859 |
"2 4 1 ... 3 3 0 0 4 \n", |
|
|
860 |
"3 0 1 ... 0 3 2 0 1 \n", |
|
|
861 |
"4 0 0 ... 1 0 0 0 3 \n", |
|
|
862 |
"\n", |
|
|
863 |
" NA21137 NA21141 NA21142 NA21143 NA21144\\n \n", |
|
|
864 |
"0 2 2 0 0 2 \n", |
|
|
865 |
"1 0 0 0 1 0 \n", |
|
|
866 |
"2 1 0 0 0 1 \n", |
|
|
867 |
"3 0 1 0 1 0 \n", |
|
|
868 |
"4 0 2 1 0 0 \n", |
|
|
869 |
"\n", |
|
|
870 |
"[5 rows x 2504 columns]" |
|
|
871 |
] |
|
|
872 |
}, |
|
|
873 |
"execution_count": 36, |
|
|
874 |
"metadata": {}, |
|
|
875 |
"output_type": "execute_result" |
|
|
876 |
} |
|
|
877 |
], |
|
|
878 |
"source": [ |
|
|
879 |
"trainingX.head()" |
|
|
880 |
] |
|
|
881 |
}, |
|
|
882 |
{ |
|
|
883 |
"cell_type": "code", |
|
|
884 |
"execution_count": 373, |
|
|
885 |
"metadata": {}, |
|
|
886 |
"outputs": [], |
|
|
887 |
"source": [ |
|
|
888 |
"def MIfeatureSelector(trainingX, trainingY, responseString, meta, MIthreshold, testingX):\n", |
|
|
889 |
" # Drop rows of trainingY whose response is NA. We will not be imputing missing responses.\n", |
|
|
890 |
" trainingY = trainingY.dropna(subset = [responseString])\n", |
|
|
891 |
" trainingY = trainingY[[responseString]]\n", |
|
|
892 |
" \n", |
|
|
893 |
" #Match IDs of X and y\n", |
|
|
894 |
" trainingX = trainingX.loc[trainingY.index,:]\n", |
|
|
895 |
" \n", |
|
|
896 |
" #Match challengeIDs of meta\n", |
|
|
897 |
" meta = meta.loc[meta[\"old_name\"].isin(trainingX)]\n", |
|
|
898 |
" trainingX = trainingX.loc[:,meta.old_name]\n", |
|
|
899 |
" \n", |
|
|
900 |
" #MI regression\n", |
|
|
901 |
" mi = mutual_info_regression(X = trainingX,\n", |
|
|
902 |
" y = trainingY[responseString],\n", |
|
|
903 |
" discrete_features = list(meta[\"discrete\"]), \n", |
|
|
904 |
" random_state = 10)\n", |
|
|
905 |
" trainingX = trainingX.loc[:, mi >= MIthreshold]\n", |
|
|
906 |
" #Return non-NA training responses, training features with matching challengeIDs and appropriate MI\n", |
|
|
907 |
"\n", |
|
|
908 |
" return trainingX, trainingY, testingX[trainingX.columns]" |
|
|
909 |
] |
|
|
910 |
}, |
|
|
911 |
{ |
|
|
912 |
"cell_type": "markdown", |
|
|
913 |
"metadata": {}, |
|
|
914 |
"source": [ |
|
|
915 |
"# PCA, SVM" |
|
|
916 |
] |
|
|
917 |
}, |
|
|
918 |
{ |
|
|
919 |
"cell_type": "code", |
|
|
920 |
"execution_count": null, |
|
|
921 |
"metadata": {}, |
|
|
922 |
"outputs": [], |
|
|
923 |
"source": [ |
|
|
924 |
"def fitPCA(X):\n", |
|
|
925 |
" pca = PCA(n_components = True)\n", |
|
|
926 |
" pca.fit(X)\n", |
|
|
927 |
" #return pca.components_, pca.explained_variance_ratio_\n", |
|
|
928 |
" return pca\n", |
|
|
929 |
"pca_chr = fitPCA(data.iloc[].transpose())" |
|
|
930 |
] |
|
|
931 |
}, |
|
|
932 |
{ |
|
|
933 |
"cell_type": "code", |
|
|
934 |
"execution_count": null, |
|
|
935 |
"metadata": {}, |
|
|
936 |
"outputs": [], |
|
|
937 |
"source": [ |
|
|
938 |
"def fitSVM(X,Y, testX, testY):\n", |
|
|
939 |
" # fit SVM using hinge loss, L2 penalty via SGD\n", |
|
|
940 |
" svm = SGDClassifier(random_state = 7).fit(X,Y)\n", |
|
|
941 |
" return svm, svm.coef_, svm.score(testX, testY)\n", |
|
|
942 |
"\n", |
|
|
943 |
"models = {}; featureWeights = {}; predictionScore = {}\n", |
|
|
944 |
"\n", |
|
|
945 |
"for key in trainXDict:\n", |
|
|
946 |
" newkey = \"SVM_\" + key\n", |
|
|
947 |
" print(newkey)\n", |
|
|
948 |
" models[newkey], featureWeights[newkey], predictionScore[newkey] = fitSVM(\n", |
|
|
949 |
" X = trainX[key].iloc[:,0],\n", |
|
|
950 |
" Y = trainY[key],\n", |
|
|
951 |
" testX = testX[key],\n", |
|
|
952 |
" testY = testY[key]\n", |
|
|
953 |
" )\n", |
|
|
954 |
" models[\"PCA_\" + key] = fitPCA(X = trainX[key])" |
|
|
955 |
] |
|
|
956 |
}, |
|
|
957 |
{ |
|
|
958 |
"cell_type": "markdown", |
|
|
959 |
"metadata": {}, |
|
|
960 |
"source": [ |
|
|
961 |
"# Analysis" |
|
|
962 |
] |
|
|
963 |
}, |
|
|
964 |
{ |
|
|
965 |
"cell_type": "code", |
|
|
966 |
"execution_count": null, |
|
|
967 |
"metadata": {}, |
|
|
968 |
"outputs": [], |
|
|
969 |
"source": [ |
|
|
970 |
"# TODO: plot histogram of pca.explained_variance_ratio_\n", |
|
|
971 |
"# TODO: plot pca.components_ 0 and 1, colored by testing\n", |
|
|
972 |
"\n", |
|
|
973 |
"# TODO: plot svm.score(testX, testY)\n", |
|
|
974 |
"# TODO: plot coef names of the top 10 svm.coef_, ordered by level\n", |
|
|
975 |
"# variation score in the chi-squared test" |
|
|
976 |
] |
|
|
977 |
} |
|
|
978 |
], |
|
|
979 |
"metadata": { |
|
|
980 |
"kernelspec": { |
|
|
981 |
"display_name": "Python 3", |
|
|
982 |
"language": "python", |
|
|
983 |
"name": "python3" |
|
|
984 |
}, |
|
|
985 |
"language_info": { |
|
|
986 |
"codemirror_mode": { |
|
|
987 |
"name": "ipython", |
|
|
988 |
"version": 3 |
|
|
989 |
}, |
|
|
990 |
"file_extension": ".py", |
|
|
991 |
"mimetype": "text/x-python", |
|
|
992 |
"name": "python", |
|
|
993 |
"nbconvert_exporter": "python", |
|
|
994 |
"pygments_lexer": "ipython3", |
|
|
995 |
"version": "3.7.3" |
|
|
996 |
} |
|
|
997 |
}, |
|
|
998 |
"nbformat": 4, |
|
|
999 |
"nbformat_minor": 2 |
|
|
1000 |
} |