a b/1000genomes_dataExploration.ipynb
1
{
2
 "cells": [
3
  {
4
   "cell_type": "code",
5
   "execution_count": 28,
6
   "metadata": {},
7
   "outputs": [],
8
   "source": [
9
    "import pandas as pd\n",
10
    "import numpy as np\n",
11
    "import io\n",
12
    "import os\n",
13
    "import matplotlib.pyplot as plot\n",
14
    "from sklearn.preprocessing import LabelEncoder\n",
15
    "from sklearn.feature_selection import chi2\n",
16
    "from sklearn.decomposition import PCA\n",
17
    "from sklearn.linear_model import SGDClassifier"
18
   ]
19
  },
20
  {
21
   "cell_type": "markdown",
22
   "metadata": {},
23
   "source": [
24
    "# Data Import and Processing\n",
25
    "- Haplotype data downloaded from ftp://1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/\n",
26
    "    - Files are in VCF format; see the variable `meta` for a description of the data\n",
27
    "- Sample metadata (`igsr_samples.tsv`) downloaded from\n",
28
    "http://www.internationalgenome.org/data-portal/sample\n"
29
   ]
30
  },
31
  {
32
   "cell_type": "code",
33
   "execution_count": 18,
34
   "metadata": {},
35
   "outputs": [],
36
   "source": [
37
    "# Data Import\n",
38
    "directory = \"/Users/judydu/Desktop/AI4All/data/\"\n",
39
    "path = os.listdir(directory)\n",
40
    "chrom = 1\n",
41
    "\n",
42
    "origins = pd.read_csv(directory + \"igsr_samples.tsv\",\n",
43
    "                     sep = \"\\t\")\n",
44
    "with open(directory + path[chrom], 'r') as f:\n",
45
    "    meta = [l for l in f if l.startswith('#')]"
46
   ]
47
  },
48
  {
49
   "cell_type": "code",
50
   "execution_count": 21,
51
   "metadata": {},
52
   "outputs": [
53
    {
54
     "data": {
55
      "text/plain": [
56
       "['##fileformat=VCFv4.2\\n',\n",
57
       " '##ALT=<ID=CN0,Description=\"Copy number allele: 0 copies\">\\n',\n",
58
       " '##ALT=<ID=CN1,Description=\"Copy number allele: 1 copy\">\\n',\n",
59
       " '##ALT=<ID=CN10,Description=\"Copy number allele: 10 copies\">\\n',\n",
60
       " '##ALT=<ID=CN100,Description=\"Copy number allele: 100 copies\">\\n',\n",
61
       " '##ALT=<ID=CN101,Description=\"Copy number allele: 101 copies\">\\n',\n",
62
       " '##ALT=<ID=CN102,Description=\"Copy number allele: 102 copies\">\\n',\n",
63
       " '##ALT=<ID=CN103,Description=\"Copy number allele: 103 copies\">\\n',\n",
64
       " '##ALT=<ID=CN104,Description=\"Copy number allele: 104 copies\">\\n',\n",
65
       " '##ALT=<ID=CN105,Description=\"Copy number allele: 105 copies\">\\n',\n",
66
       " '##ALT=<ID=CN106,Description=\"Copy number allele: 106 copies\">\\n',\n",
67
       " '##ALT=<ID=CN107,Description=\"Copy number allele: 107 copies\">\\n',\n",
68
       " '##ALT=<ID=CN108,Description=\"Copy number allele: 108 copies\">\\n',\n",
69
       " '##ALT=<ID=CN109,Description=\"Copy number allele: 109 copies\">\\n',\n",
70
       " '##ALT=<ID=CN11,Description=\"Copy number allele: 11 copies\">\\n',\n",
71
       " '##ALT=<ID=CN110,Description=\"Copy number allele: 110 copies\">\\n',\n",
72
       " '##ALT=<ID=CN111,Description=\"Copy number allele: 111 copies\">\\n',\n",
73
       " '##ALT=<ID=CN112,Description=\"Copy number allele: 112 copies\">\\n',\n",
74
       " '##ALT=<ID=CN113,Description=\"Copy number allele: 113 copies\">\\n',\n",
75
       " '##ALT=<ID=CN114,Description=\"Copy number allele: 114 copies\">\\n',\n",
76
       " '##ALT=<ID=CN115,Description=\"Copy number allele: 115 copies\">\\n',\n",
77
       " '##ALT=<ID=CN116,Description=\"Copy number allele: 116 copies\">\\n',\n",
78
       " '##ALT=<ID=CN117,Description=\"Copy number allele: 117 copies\">\\n',\n",
79
       " '##ALT=<ID=CN118,Description=\"Copy number allele: 118 copies\">\\n',\n",
80
       " '##ALT=<ID=CN119,Description=\"Copy number allele: 119 copies\">\\n',\n",
81
       " '##ALT=<ID=CN12,Description=\"Copy number allele: 12 copies\">\\n',\n",
82
       " '##ALT=<ID=CN120,Description=\"Copy number allele: 120 copies\">\\n',\n",
83
       " '##ALT=<ID=CN121,Description=\"Copy number allele: 121 copies\">\\n',\n",
84
       " '##ALT=<ID=CN122,Description=\"Copy number allele: 122 copies\">\\n',\n",
85
       " '##ALT=<ID=CN123,Description=\"Copy number allele: 123 copies\">\\n',\n",
86
       " '##ALT=<ID=CN124,Description=\"Copy number allele: 124 copies\">\\n',\n",
87
       " '##ALT=<ID=CN13,Description=\"Copy number allele: 13 copies\">\\n',\n",
88
       " '##ALT=<ID=CN14,Description=\"Copy number allele: 14 copies\">\\n',\n",
89
       " '##ALT=<ID=CN15,Description=\"Copy number allele: 15 copies\">\\n',\n",
90
       " '##ALT=<ID=CN16,Description=\"Copy number allele: 16 copies\">\\n',\n",
91
       " '##ALT=<ID=CN17,Description=\"Copy number allele: 17 copies\">\\n',\n",
92
       " '##ALT=<ID=CN18,Description=\"Copy number allele: 18 copies\">\\n',\n",
93
       " '##ALT=<ID=CN19,Description=\"Copy number allele: 19 copies\">\\n',\n",
94
       " '##ALT=<ID=CN2,Description=\"Copy number allele: 2 copies\">\\n',\n",
95
       " '##ALT=<ID=CN20,Description=\"Copy number allele: 20 copies\">\\n',\n",
96
       " '##ALT=<ID=CN21,Description=\"Copy number allele: 21 copies\">\\n',\n",
97
       " '##ALT=<ID=CN22,Description=\"Copy number allele: 22 copies\">\\n',\n",
98
       " '##ALT=<ID=CN23,Description=\"Copy number allele: 23 copies\">\\n',\n",
99
       " '##ALT=<ID=CN24,Description=\"Copy number allele: 24 copies\">\\n',\n",
100
       " '##ALT=<ID=CN25,Description=\"Copy number allele: 25 copies\">\\n',\n",
101
       " '##ALT=<ID=CN26,Description=\"Copy number allele: 26 copies\">\\n',\n",
102
       " '##ALT=<ID=CN27,Description=\"Copy number allele: 27 copies\">\\n',\n",
103
       " '##ALT=<ID=CN28,Description=\"Copy number allele: 28 copies\">\\n',\n",
104
       " '##ALT=<ID=CN29,Description=\"Copy number allele: 29 copies\">\\n',\n",
105
       " '##ALT=<ID=CN3,Description=\"Copy number allele: 3 copies\">\\n',\n",
106
       " '##ALT=<ID=CN30,Description=\"Copy number allele: 30 copies\">\\n',\n",
107
       " '##ALT=<ID=CN31,Description=\"Copy number allele: 31 copies\">\\n',\n",
108
       " '##ALT=<ID=CN32,Description=\"Copy number allele: 32 copies\">\\n',\n",
109
       " '##ALT=<ID=CN33,Description=\"Copy number allele: 33 copies\">\\n',\n",
110
       " '##ALT=<ID=CN34,Description=\"Copy number allele: 34 copies\">\\n',\n",
111
       " '##ALT=<ID=CN35,Description=\"Copy number allele: 35 copies\">\\n',\n",
112
       " '##ALT=<ID=CN36,Description=\"Copy number allele: 36 copies\">\\n',\n",
113
       " '##ALT=<ID=CN37,Description=\"Copy number allele: 37 copies\">\\n',\n",
114
       " '##ALT=<ID=CN38,Description=\"Copy number allele: 38 copies\">\\n',\n",
115
       " '##ALT=<ID=CN39,Description=\"Copy number allele: 39 copies\">\\n',\n",
116
       " '##ALT=<ID=CN4,Description=\"Copy number allele: 4 copies\">\\n',\n",
117
       " '##ALT=<ID=CN40,Description=\"Copy number allele: 40 copies\">\\n',\n",
118
       " '##ALT=<ID=CN41,Description=\"Copy number allele: 41 copies\">\\n',\n",
119
       " '##ALT=<ID=CN42,Description=\"Copy number allele: 42 copies\">\\n',\n",
120
       " '##ALT=<ID=CN43,Description=\"Copy number allele: 43 copies\">\\n',\n",
121
       " '##ALT=<ID=CN44,Description=\"Copy number allele: 44 copies\">\\n',\n",
122
       " '##ALT=<ID=CN45,Description=\"Copy number allele: 45 copies\">\\n',\n",
123
       " '##ALT=<ID=CN46,Description=\"Copy number allele: 46 copies\">\\n',\n",
124
       " '##ALT=<ID=CN47,Description=\"Copy number allele: 47 copies\">\\n',\n",
125
       " '##ALT=<ID=CN48,Description=\"Copy number allele: 48 copies\">\\n',\n",
126
       " '##ALT=<ID=CN49,Description=\"Copy number allele: 49 copies\">\\n',\n",
127
       " '##ALT=<ID=CN5,Description=\"Copy number allele: 5 copies\">\\n',\n",
128
       " '##ALT=<ID=CN50,Description=\"Copy number allele: 50 copies\">\\n',\n",
129
       " '##ALT=<ID=CN51,Description=\"Copy number allele: 51 copies\">\\n',\n",
130
       " '##ALT=<ID=CN52,Description=\"Copy number allele: 52 copies\">\\n',\n",
131
       " '##ALT=<ID=CN53,Description=\"Copy number allele: 53 copies\">\\n',\n",
132
       " '##ALT=<ID=CN54,Description=\"Copy number allele: 54 copies\">\\n',\n",
133
       " '##ALT=<ID=CN55,Description=\"Copy number allele: 55 copies\">\\n',\n",
134
       " '##ALT=<ID=CN56,Description=\"Copy number allele: 56 copies\">\\n',\n",
135
       " '##ALT=<ID=CN57,Description=\"Copy number allele: 57 copies\">\\n',\n",
136
       " '##ALT=<ID=CN58,Description=\"Copy number allele: 58 copies\">\\n',\n",
137
       " '##ALT=<ID=CN59,Description=\"Copy number allele: 59 copies\">\\n',\n",
138
       " '##ALT=<ID=CN6,Description=\"Copy number allele: 6 copies\">\\n',\n",
139
       " '##ALT=<ID=CN60,Description=\"Copy number allele: 60 copies\">\\n',\n",
140
       " '##ALT=<ID=CN61,Description=\"Copy number allele: 61 copies\">\\n',\n",
141
       " '##ALT=<ID=CN62,Description=\"Copy number allele: 62 copies\">\\n',\n",
142
       " '##ALT=<ID=CN63,Description=\"Copy number allele: 63 copies\">\\n',\n",
143
       " '##ALT=<ID=CN64,Description=\"Copy number allele: 64 copies\">\\n',\n",
144
       " '##ALT=<ID=CN65,Description=\"Copy number allele: 65 copies\">\\n',\n",
145
       " '##ALT=<ID=CN66,Description=\"Copy number allele: 66 copies\">\\n',\n",
146
       " '##ALT=<ID=CN67,Description=\"Copy number allele: 67 copies\">\\n',\n",
147
       " '##ALT=<ID=CN68,Description=\"Copy number allele: 68 copies\">\\n',\n",
148
       " '##ALT=<ID=CN69,Description=\"Copy number allele: 69 copies\">\\n',\n",
149
       " '##ALT=<ID=CN7,Description=\"Copy number allele: 7 copies\">\\n',\n",
150
       " '##ALT=<ID=CN70,Description=\"Copy number allele: 70 copies\">\\n',\n",
151
       " '##ALT=<ID=CN71,Description=\"Copy number allele: 71 copies\">\\n',\n",
152
       " '##ALT=<ID=CN72,Description=\"Copy number allele: 72 copies\">\\n',\n",
153
       " '##ALT=<ID=CN73,Description=\"Copy number allele: 73 copies\">\\n',\n",
154
       " '##ALT=<ID=CN74,Description=\"Copy number allele: 74 copies\">\\n',\n",
155
       " '##ALT=<ID=CN75,Description=\"Copy number allele: 75 copies\">\\n',\n",
156
       " '##ALT=<ID=CN76,Description=\"Copy number allele: 76 copies\">\\n',\n",
157
       " '##ALT=<ID=CN77,Description=\"Copy number allele: 77 copies\">\\n',\n",
158
       " '##ALT=<ID=CN78,Description=\"Copy number allele: 78 copies\">\\n',\n",
159
       " '##ALT=<ID=CN79,Description=\"Copy number allele: 79 copies\">\\n',\n",
160
       " '##ALT=<ID=CN8,Description=\"Copy number allele: 8 copies\">\\n',\n",
161
       " '##ALT=<ID=CN80,Description=\"Copy number allele: 80 copies\">\\n',\n",
162
       " '##ALT=<ID=CN81,Description=\"Copy number allele: 81 copies\">\\n',\n",
163
       " '##ALT=<ID=CN82,Description=\"Copy number allele: 82 copies\">\\n',\n",
164
       " '##ALT=<ID=CN83,Description=\"Copy number allele: 83 copies\">\\n',\n",
165
       " '##ALT=<ID=CN84,Description=\"Copy number allele: 84 copies\">\\n',\n",
166
       " '##ALT=<ID=CN85,Description=\"Copy number allele: 85 copies\">\\n',\n",
167
       " '##ALT=<ID=CN86,Description=\"Copy number allele: 86 copies\">\\n',\n",
168
       " '##ALT=<ID=CN87,Description=\"Copy number allele: 87 copies\">\\n',\n",
169
       " '##ALT=<ID=CN88,Description=\"Copy number allele: 88 copies\">\\n',\n",
170
       " '##ALT=<ID=CN89,Description=\"Copy number allele: 89 copies\">\\n',\n",
171
       " '##ALT=<ID=CN9,Description=\"Copy number allele: 9 copies\">\\n',\n",
172
       " '##ALT=<ID=CN90,Description=\"Copy number allele: 90 copies\">\\n',\n",
173
       " '##ALT=<ID=CN91,Description=\"Copy number allele: 91 copies\">\\n',\n",
174
       " '##ALT=<ID=CN92,Description=\"Copy number allele: 92 copies\">\\n',\n",
175
       " '##ALT=<ID=CN93,Description=\"Copy number allele: 93 copies\">\\n',\n",
176
       " '##ALT=<ID=CN94,Description=\"Copy number allele: 94 copies\">\\n',\n",
177
       " '##ALT=<ID=CN95,Description=\"Copy number allele: 95 copies\">\\n',\n",
178
       " '##ALT=<ID=CN96,Description=\"Copy number allele: 96 copies\">\\n',\n",
179
       " '##ALT=<ID=CN97,Description=\"Copy number allele: 97 copies\">\\n',\n",
180
       " '##ALT=<ID=CN98,Description=\"Copy number allele: 98 copies\">\\n',\n",
181
       " '##ALT=<ID=CN99,Description=\"Copy number allele: 99 copies\">\\n',\n",
182
       " '##ALT=<ID=CNV,Description=\"Copy Number Polymorphism\">\\n',\n",
183
       " '##ALT=<ID=DEL,Description=\"Deletion\">\\n',\n",
184
       " '##ALT=<ID=DUP,Description=\"Duplication\">\\n',\n",
185
       " '##ALT=<ID=INS:ME:ALU,Description=\"Insertion of ALU element\">\\n',\n",
186
       " '##ALT=<ID=INS:ME:LINE1,Description=\"Insertion of LINE1 element\">\\n',\n",
187
       " '##ALT=<ID=INS:ME:SVA,Description=\"Insertion of SVA element\">\\n',\n",
188
       " '##ALT=<ID=INS:MT,Description=\"Nuclear Mitochondrial Insertion\">\\n',\n",
189
       " '##ALT=<ID=INV,Description=\"Inversion\">\\n',\n",
190
       " '##FILTER=<ID=PASS,Description=\"All filters passed\">\\n',\n",
191
       " '##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\\n',\n",
192
       " '##GATKCommandLine=<ID=SelectVariants,CommandLine=\"SelectVariants  --output ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes_LDsubset.vcf --keep-ids gwas_sv_ld_filt_af_RSIDonly.list --variant ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf  --invertSelect false --exclude-non-variants false --exclude-filtered false --preserve-alleles false --remove-unused-alternates false --restrict-alleles-to ALL --keep-original-ac false --keep-original-dp false --mendelian-violation false --invert-mendelian-violation false --mendelian-violation-qual-threshold 0.0 --select-random-fraction 0.0 --remove-fraction-genotypes 0.0 --fully-decode false --max-indel-size 2147483647 --min-indel-size 0 --max-filtered-genotypes 2147483647 --min-filtered-genotypes 0 --max-fraction-filtered-genotypes 1.0 --min-fraction-filtered-genotypes 0.0 --max-nocall-number 2147483647 --max-nocall-fraction 1.0 --set-filtered-gt-to-nocall false --allow-nonoverlapping-command-line-samples false --suppress-reference-path false --interval-set-rule UNION --interval-padding 0 --interval-exclusion-padding 0 --interval-merging-rule ALL --read-validation-stringency SILENT --seconds-between-progress-updates 10.0 --disable-sequence-dictionary-validation false --create-output-bam-index true --create-output-bam-md5 false --create-output-variant-index true --create-output-variant-md5 false --lenient false --add-output-sam-program-record true --add-output-vcf-command-line true --cloud-prefetch-buffer 40 --cloud-index-prefetch-buffer -1 --disable-bam-index-caching false --sites-only-vcf-output false --help false --version false --showHidden false --verbosity INFO --QUIET false --use-jdk-deflater false --use-jdk-inflater false --gcs-max-retries 20 --gcs-project-for-requester-pays  --disable-tool-default-read-filters false\",Version=\"4.1.2.0\",Date=\"July 2, 2019 12:12:58 PM EDT\">\\n',\n",
193
       " '##INFO=<ID=AA,Number=1,Type=String,Description=\"Ancestral Allele. Format: AA|REF|ALT|IndelType. AA: Ancestral allele, REF:Reference Allele, ALT:Alternate Allele, IndelType:Type of Indel (REF, ALT and IndelType are only defined for indels)\">\\n',\n",
194
       " '##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes, for each ALT allele, in the same order as listed\">\\n',\n",
195
       " '##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Total number of alternate alleles in called genotypes\">\\n',\n",
196
       " '##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency, for each ALT allele, in the same order as listed\">\\n',\n",
197
       " '##INFO=<ID=AF,Number=A,Type=Float,Description=\"Estimated allele frequency in the range (0,1)\">\\n',\n",
198
       " '##INFO=<ID=AFR_AF,Number=A,Type=Float,Description=\"Allele frequency in the AFR populations calculated from AC and AN, in the range (0,1)\">\\n',\n",
199
       " '##INFO=<ID=AMR_AF,Number=A,Type=Float,Description=\"Allele frequency in the AMR populations calculated from AC and AN, in the range (0,1)\">\\n',\n",
200
       " '##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">\\n',\n",
201
       " '##INFO=<ID=CIEND,Number=2,Type=Integer,Description=\"Confidence interval around END for imprecise variants\">\\n',\n",
202
       " '##INFO=<ID=CIPOS,Number=2,Type=Integer,Description=\"Confidence interval around POS for imprecise variants\">\\n',\n",
203
       " '##INFO=<ID=CS,Number=1,Type=String,Description=\"Source call set.\">\\n',\n",
204
       " '##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Approximate read depth; some reads may have been filtered\">\\n',\n",
205
       " '##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total read depth; only low coverage data were counted towards the DP, exome data were not used\">\\n',\n",
206
       " '##INFO=<ID=EAS_AF,Number=A,Type=Float,Description=\"Allele frequency in the EAS populations calculated from AC and AN, in the range (0,1)\">\\n',\n",
207
       " '##INFO=<ID=END,Number=1,Type=Integer,Description=\"End coordinate of this variant\">\\n',\n",
208
       " '##INFO=<ID=EUR_AF,Number=A,Type=Float,Description=\"Allele frequency in the EUR populations calculated from AC and AN, in the range (0,1)\">\\n',\n",
209
       " '##INFO=<ID=EX_TARGET,Number=0,Type=Flag,Description=\"indicates whether a variant is within the exon pull down target boundaries\">\\n',\n",
210
       " '##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description=\"Imprecise structural variation\">\\n',\n",
211
       " '##INFO=<ID=MC,Number=.,Type=String,Description=\"Merged calls.\">\\n',\n",
212
       " '##INFO=<ID=MEINFO,Number=4,Type=String,Description=\"Mobile element info of the form NAME,START,END<POLARITY; If there is only 5\\' OR 3\\' support for this call, will be NULL NULL for START and END\">\\n',\n",
213
       " '##INFO=<ID=MEND,Number=1,Type=Integer,Description=\"Mitochondrial end coordinate of inserted sequence\">\\n',\n",
214
       " '##INFO=<ID=MLEN,Number=1,Type=Integer,Description=\"Estimated length of mitochondrial insert\">\\n',\n",
215
       " '##INFO=<ID=MSTART,Number=1,Type=Integer,Description=\"Mitochondrial start coordinate of inserted sequence\">\\n',\n",
216
       " '##INFO=<ID=MULTI_ALLELIC,Number=0,Type=Flag,Description=\"indicates whether a site is multi-allelic\">\\n',\n",
217
       " '##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples with data\">\\n',\n",
218
       " '##INFO=<ID=SAS_AF,Number=A,Type=Float,Description=\"Allele frequency in the SAS populations calculated from AC and AN, in the range (0,1)\">\\n',\n",
219
       " '##INFO=<ID=SVLEN,Number=.,Type=Integer,Description=\"SV length. It is only calculated for structural variation MEIs. For other types of SVs; one may calculate the SV length by INFO:END-START+1, or by finding the difference between lengthes of REF and ALT alleles\">\\n',\n",
220
       " '##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">\\n',\n",
221
       " '##INFO=<ID=TSD,Number=1,Type=String,Description=\"Precise Target Site Duplication for bases, if unknown, value will be NULL\">\\n',\n",
222
       " '##INFO=<ID=VT,Number=.,Type=String,Description=\"indicates what type of variant the line represents\">\\n',\n",
223
       " '##contig=<ID=1,length=249250621>\\n',\n",
224
       " '##contig=<ID=2,length=243199373>\\n',\n",
225
       " '##contig=<ID=3,length=198022430>\\n',\n",
226
       " '##contig=<ID=4,length=191154276>\\n',\n",
227
       " '##contig=<ID=5,length=180915260>\\n',\n",
228
       " '##contig=<ID=6,length=171115067>\\n',\n",
229
       " '##contig=<ID=7,length=159138663>\\n',\n",
230
       " '##contig=<ID=8,length=146364022>\\n',\n",
231
       " '##contig=<ID=9,length=141213431>\\n',\n",
232
       " '##contig=<ID=10,length=135534747>\\n',\n",
233
       " '##contig=<ID=11,length=135006516>\\n',\n",
234
       " '##contig=<ID=12,length=133851895>\\n',\n",
235
       " '##contig=<ID=13,length=115169878>\\n',\n",
236
       " '##contig=<ID=14,length=107349540>\\n',\n",
237
       " '##contig=<ID=15,length=102531392>\\n',\n",
238
       " '##contig=<ID=16,length=90354753>\\n',\n",
239
       " '##contig=<ID=17,length=81195210>\\n',\n",
240
       " '##contig=<ID=18,length=78077248>\\n',\n",
241
       " '##contig=<ID=19,length=59128983>\\n',\n",
242
       " '##contig=<ID=20,length=63025520>\\n',\n",
243
       " '##contig=<ID=21,length=48129895>\\n',\n",
244
       " '##contig=<ID=22,length=51304566>\\n',\n",
245
       " '##contig=<ID=GL000191.1,length=106433>\\n',\n",
246
       " '##contig=<ID=GL000192.1,length=547496>\\n',\n",
247
       " '##contig=<ID=GL000193.1,length=189789>\\n',\n",
248
       " '##contig=<ID=GL000194.1,length=191469>\\n',\n",
249
       " '##contig=<ID=GL000195.1,length=182896>\\n',\n",
250
       " '##contig=<ID=GL000196.1,length=38914>\\n',\n",
251
       " '##contig=<ID=GL000197.1,length=37175>\\n',\n",
252
       " '##contig=<ID=GL000198.1,length=90085>\\n',\n",
253
       " '##contig=<ID=GL000199.1,length=169874>\\n',\n",
254
       " '##contig=<ID=GL000200.1,length=187035>\\n',\n",
255
       " '##contig=<ID=GL000201.1,length=36148>\\n',\n",
256
       " '##contig=<ID=GL000202.1,length=40103>\\n',\n",
257
       " '##contig=<ID=GL000203.1,length=37498>\\n',\n",
258
       " '##contig=<ID=GL000204.1,length=81310>\\n',\n",
259
       " '##contig=<ID=GL000205.1,length=174588>\\n',\n",
260
       " '##contig=<ID=GL000206.1,length=41001>\\n',\n",
261
       " '##contig=<ID=GL000207.1,length=4262>\\n',\n",
262
       " '##contig=<ID=GL000208.1,length=92689>\\n',\n",
263
       " '##contig=<ID=GL000209.1,length=159169>\\n',\n",
264
       " '##contig=<ID=GL000210.1,length=27682>\\n',\n",
265
       " '##contig=<ID=GL000211.1,length=166566>\\n',\n",
266
       " '##contig=<ID=GL000212.1,length=186858>\\n',\n",
267
       " '##contig=<ID=GL000213.1,length=164239>\\n',\n",
268
       " '##contig=<ID=GL000214.1,length=137718>\\n',\n",
269
       " '##contig=<ID=GL000215.1,length=172545>\\n',\n",
270
       " '##contig=<ID=GL000216.1,length=172294>\\n',\n",
271
       " '##contig=<ID=GL000217.1,length=172149>\\n',\n",
272
       " '##contig=<ID=GL000218.1,length=161147>\\n',\n",
273
       " '##contig=<ID=GL000219.1,length=179198>\\n',\n",
274
       " '##contig=<ID=GL000220.1,length=161802>\\n',\n",
275
       " '##contig=<ID=GL000221.1,length=155397>\\n',\n",
276
       " '##contig=<ID=GL000222.1,length=186861>\\n',\n",
277
       " '##contig=<ID=GL000223.1,length=180455>\\n',\n",
278
       " '##contig=<ID=GL000224.1,length=179693>\\n',\n",
279
       " '##contig=<ID=GL000225.1,length=211173>\\n',\n",
280
       " '##contig=<ID=GL000226.1,length=15008>\\n',\n",
281
       " '##contig=<ID=GL000227.1,length=128374>\\n',\n",
282
       " '##contig=<ID=GL000228.1,length=129120>\\n',\n",
283
       " '##contig=<ID=GL000229.1,length=19913>\\n',\n",
284
       " '##contig=<ID=GL000230.1,length=43691>\\n',\n",
285
       " '##contig=<ID=GL000231.1,length=27386>\\n',\n",
286
       " '##contig=<ID=GL000232.1,length=40652>\\n',\n",
287
       " '##contig=<ID=GL000233.1,length=45941>\\n',\n",
288
       " '##contig=<ID=GL000234.1,length=40531>\\n',\n",
289
       " '##contig=<ID=GL000235.1,length=34474>\\n',\n",
290
       " '##contig=<ID=GL000236.1,length=41934>\\n',\n",
291
       " '##contig=<ID=GL000237.1,length=45867>\\n',\n",
292
       " '##contig=<ID=GL000238.1,length=39939>\\n',\n",
293
       " '##contig=<ID=GL000239.1,length=33824>\\n',\n",
294
       " '##contig=<ID=GL000240.1,length=41933>\\n',\n",
295
       " '##contig=<ID=GL000241.1,length=42152>\\n',\n",
296
       " '##contig=<ID=GL000242.1,length=43523>\\n',\n",
297
       " '##contig=<ID=GL000243.1,length=43341>\\n',\n",
298
       " '##contig=<ID=GL000244.1,length=39929>\\n',\n",
299
       " '##contig=<ID=GL000245.1,length=36651>\\n',\n",
300
       " '##contig=<ID=GL000246.1,length=38154>\\n',\n",
301
       " '##contig=<ID=GL000247.1,length=36422>\\n',\n",
302
       " '##contig=<ID=GL000248.1,length=39786>\\n',\n",
303
       " '##contig=<ID=GL000249.1,length=38502>\\n',\n",
304
       " '##contig=<ID=MT,length=16569>\\n',\n",
305
       " '##contig=<ID=NC_007605,length=171823>\\n',\n",
306
       " '##contig=<ID=X,length=155270560>\\n',\n",
307
       " '##contig=<ID=Y,length=59373566>\\n',\n",
308
       " '##contig=<ID=hs37d5,length=35477943>\\n',\n",
309
       " '##fileDate=20150218\\n',\n",
310
       " '##source=1000GenomesPhase3Pipeline\\n',\n",
311
       " '##source=SelectVariants\\n',\n",
312
       " '#CHROM\\tPOS\\tID\\tREF\\tALT\\tQUAL\\tFILTER\\tINFO\\tFORMAT\\tHG00096\\tHG00097\\tHG00099\\tHG00100\\tHG00101\\tHG00102\\tHG00103\\tHG00105\\tHG00106\\tHG00107\\tHG00108\\tHG00109\\tHG00110\\tHG00111\\tHG00112\\tHG00113\\tHG00114\\tHG00115\\tHG00116\\tHG00117\\tHG00118\\tHG00119\\tHG00120\\tHG00121\\tHG00122\\tHG00123\\tHG00125\\tHG00126\\tHG00127\\tHG00128\\tHG00129\\tHG00130\\tHG00131\\tHG00132\\tHG00133\\tHG00136\\tHG00137\\tHG00138\\tHG00139\\tHG00140\\tHG00141\\tHG00142\\tHG00143\\tHG00145\\tHG00146\\tHG00148\\tHG00149\\tHG00150\\tHG00151\\tHG00154\\tHG00155\\tHG00157\\tHG00158\\tHG00159\\tHG00160\\tHG00171\\tHG00173\\tHG00174\\tHG00176\\tHG00177\\tHG00178\\tHG00179\\tHG00180\\tHG00181\\tHG00182\\tHG00183\\tHG00185\\tHG00186\\tHG00187\\tHG00188\\tHG00189\\tHG00190\\tHG00231\\tHG00232\\tHG00233\\tHG00234\\tHG00235\\tHG00236\\tHG00237\\tHG00238\\tHG00239\\tHG00240\\tHG00242\\tHG00243\\tHG00244\\tHG00245\\tHG00246\\tHG00250\\tHG00251\\tHG00252\\tHG00253\\tHG00254\\tHG00255\\tHG00256\\tHG00257\\tHG00258\\tHG00259\\tHG00260\\tHG00261\\tHG00262\\tHG00263\\tHG00264\\tHG00265\\tHG00266\\tHG00267\\tHG00268\\tHG00269\\tHG00271\\tHG00272\\tHG00273\\tHG00274\\tHG00275\\tHG00276\\tHG00277\\tHG00278\\tHG00280\\tHG00281\\tHG00282\\tHG00284\\tHG00285\\tHG00288\\tHG00290\\tHG00304\\tHG00306\\tHG00308\\tHG00309\\tHG00310\\tHG00311\\tHG00313\\tHG00315\\tHG00318\\tHG00319\\tHG00320\\tHG00321\\tHG00323\\tHG00324\\tHG00325\\tHG00326\\tHG00327\\tHG00328\\tHG00329\\tHG00330\\tHG00331\\tHG00332\\tHG00334\\tHG00335\\tHG00336\\tHG00337\\tHG00338\\tHG00339\\tHG00341\\tHG00342\\tHG00343\\tHG00344\\tHG00345\\tHG00346\\tHG00349\\tHG00350\\tHG00351\\tHG00353\\tHG00355\\tHG00356\\tHG00357\\tHG00358\\tHG00360\\tHG00361\\tHG00362\\tHG00364\\tHG00365\\tHG00366\\tHG00367\\tHG00368\\tHG00369\\tHG00371\\tHG00372\\tHG00373\\tHG00375\\tHG00376\\tHG00378\\tHG00379\\tHG00380\\tHG00381\\tHG00382\\tHG00383\\tHG00384\\tHG00403\\tHG00404\\tHG00406\\tHG00407\\tHG00409\\tHG00410\\tHG00419\\tHG0042
1\\tHG00422\\tHG00428\\tHG00436\\tHG00437\\tHG00442\\tHG00443\\tHG00445\\tHG00446\\tHG00448\\tHG00449\\tHG00451\\tHG00452\\tHG00457\\tHG00458\\tHG00463\\tHG00464\\tHG00472\\tHG00473\\tHG00475\\tHG00476\\tHG00478\\tHG00479\\tHG00500\\tHG00513\\tHG00524\\tHG00525\\tHG00530\\tHG00531\\tHG00533\\tHG00534\\tHG00536\\tHG00537\\tHG00542\\tHG00543\\tHG00551\\tHG00553\\tHG00554\\tHG00556\\tHG00557\\tHG00559\\tHG00560\\tHG00565\\tHG00566\\tHG00580\\tHG00581\\tHG00583\\tHG00584\\tHG00589\\tHG00590\\tHG00592\\tHG00593\\tHG00595\\tHG00596\\tHG00598\\tHG00599\\tHG00607\\tHG00608\\tHG00610\\tHG00611\\tHG00613\\tHG00614\\tHG00619\\tHG00620\\tHG00622\\tHG00623\\tHG00625\\tHG00626\\tHG00628\\tHG00629\\tHG00631\\tHG00632\\tHG00634\\tHG00637\\tHG00638\\tHG00640\\tHG00641\\tHG00650\\tHG00651\\tHG00653\\tHG00654\\tHG00656\\tHG00657\\tHG00662\\tHG00663\\tHG00671\\tHG00672\\tHG00674\\tHG00675\\tHG00683\\tHG00684\\tHG00689\\tHG00690\\tHG00692\\tHG00693\\tHG00698\\tHG00699\\tHG00701\\tHG00704\\tHG00705\\tHG00707\\tHG00708\\tHG00717\\tHG00728\\tHG00729\\tHG00731\\tHG00732\\tHG00734\\tHG00736\\tHG00737\\tHG00739\\tHG00740\\tHG00742\\tHG00743\\tHG00759\\tHG00766\\tHG00844\\tHG00851\\tHG00864\\tHG00867\\tHG00879\\tHG00881\\tHG00956\\tHG00978\\tHG00982\\tHG01028\\tHG01029\\tHG01031\\tHG01046\\tHG01047\\tHG01048\\tHG01049\\tHG01051\\tHG01052\\tHG01054\\tHG01055\\tHG01058\\tHG01060\\tHG01061\\tHG01063\\tHG01064\\tHG01066\\tHG01067\\tHG01069\\tHG01070\\tHG01072\\tHG01073\\tHG01075\\tHG01077\\tHG01079\\tHG01080\\tHG01082\\tHG01083\\tHG01085\\tHG01086\\tHG01088\\tHG01089\\tHG01092\\tHG01094\\tHG01095\\tHG01097\\tHG01098\\tHG01101\\tHG01102\\tHG01104\\tHG01105\\tHG01107\\tHG01108\\tHG01110\\tHG01111\\tHG01112\\tHG01113\\tHG01119\\tHG01121\\tHG01122\\tHG01124\\tHG01125\\tHG01130\\tHG01131\\tHG01133\\tHG01134\\tHG01136\\tHG01137\\tHG01139\\tHG01140\\tHG01142\\tHG01148\\tHG01149\\tHG01161\\tHG01162\\tHG01164\\tHG01167\\tHG01168\\tHG01170\\tHG01171\\tHG01173\\tHG01174\\tHG01176\\tHG01177\\tHG01182\\tHG0118
3\\tHG01187\\tHG01188\\tHG01190\\tHG01191\\tHG01197\\tHG01198\\tHG01200\\tHG01204\\tHG01205\\tHG01241\\tHG01242\\tHG01247\\tHG01248\\tHG01250\\tHG01251\\tHG01253\\tHG01254\\tHG01256\\tHG01257\\tHG01259\\tHG01260\\tHG01269\\tHG01271\\tHG01272\\tHG01275\\tHG01277\\tHG01280\\tHG01281\\tHG01284\\tHG01286\\tHG01302\\tHG01303\\tHG01305\\tHG01308\\tHG01311\\tHG01312\\tHG01323\\tHG01325\\tHG01326\\tHG01334\\tHG01341\\tHG01342\\tHG01344\\tHG01345\\tHG01348\\tHG01350\\tHG01351\\tHG01353\\tHG01354\\tHG01356\\tHG01357\\tHG01359\\tHG01360\\tHG01362\\tHG01363\\tHG01365\\tHG01366\\tHG01369\\tHG01372\\tHG01374\\tHG01375\\tHG01377\\tHG01378\\tHG01383\\tHG01384\\tHG01389\\tHG01390\\tHG01392\\tHG01393\\tHG01395\\tHG01396\\tHG01398\\tHG01402\\tHG01403\\tHG01405\\tHG01412\\tHG01413\\tHG01414\\tHG01431\\tHG01432\\tHG01435\\tHG01437\\tHG01438\\tHG01440\\tHG01441\\tHG01443\\tHG01444\\tHG01447\\tHG01455\\tHG01456\\tHG01459\\tHG01461\\tHG01462\\tHG01464\\tHG01465\\tHG01468\\tHG01474\\tHG01479\\tHG01485\\tHG01486\\tHG01488\\tHG01489\\tHG01491\\tHG01492\\tHG01494\\tHG01495\\tHG01497\\tHG01498\\tHG01500\\tHG01501\\tHG01503\\tHG01504\\tHG01506\\tHG01507\\tHG01509\\tHG01510\\tHG01512\\tHG01513\\tHG01515\\tHG01516\\tHG01518\\tHG01519\\tHG01521\\tHG01522\\tHG01524\\tHG01525\\tHG01527\\tHG01528\\tHG01530\\tHG01531\\tHG01536\\tHG01537\\tHG01550\\tHG01551\\tHG01556\\tHG01565\\tHG01566\\tHG01571\\tHG01572\\tHG01577\\tHG01578\\tHG01583\\tHG01586\\tHG01589\\tHG01593\\tHG01595\\tHG01596\\tHG01597\\tHG01598\\tHG01599\\tHG01600\\tHG01602\\tHG01603\\tHG01605\\tHG01606\\tHG01607\\tHG01608\\tHG01610\\tHG01612\\tHG01613\\tHG01615\\tHG01617\\tHG01618\\tHG01619\\tHG01620\\tHG01623\\tHG01624\\tHG01625\\tHG01626\\tHG01628\\tHG01630\\tHG01631\\tHG01632\\tHG01668\\tHG01669\\tHG01670\\tHG01672\\tHG01673\\tHG01675\\tHG01676\\tHG01678\\tHG01679\\tHG01680\\tHG01682\\tHG01684\\tHG01685\\tHG01686\\tHG01694\\tHG01695\\tHG01697\\tHG01699\\tHG01700\\tHG01702\\tHG01704\\tHG01705\\tHG01707\\tHG01708\\tHG01709\\tHG01710\\tHG0174
6\\tHG01747\\tHG01756\\tHG01757\\tHG01761\\tHG01762\\tHG01765\\tHG01766\\tHG01767\\tHG01768\\tHG01770\\tHG01771\\tHG01773\\tHG01775\\tHG01776\\tHG01777\\tHG01779\\tHG01781\\tHG01783\\tHG01784\\tHG01785\\tHG01786\\tHG01789\\tHG01790\\tHG01791\\tHG01794\\tHG01795\\tHG01796\\tHG01797\\tHG01798\\tHG01799\\tHG01800\\tHG01801\\tHG01802\\tHG01804\\tHG01805\\tHG01806\\tHG01807\\tHG01808\\tHG01809\\tHG01810\\tHG01811\\tHG01812\\tHG01813\\tHG01815\\tHG01816\\tHG01817\\tHG01840\\tHG01841\\tHG01842\\tHG01843\\tHG01844\\tHG01845\\tHG01846\\tHG01847\\tHG01848\\tHG01849\\tHG01850\\tHG01851\\tHG01852\\tHG01853\\tHG01855\\tHG01857\\tHG01858\\tHG01859\\tHG01860\\tHG01861\\tHG01862\\tHG01863\\tHG01864\\tHG01865\\tHG01866\\tHG01867\\tHG01868\\tHG01869\\tHG01870\\tHG01871\\tHG01872\\tHG01873\\tHG01874\\tHG01878\\tHG01879\\tHG01880\\tHG01882\\tHG01883\\tHG01885\\tHG01886\\tHG01889\\tHG01890\\tHG01892\\tHG01893\\tHG01894\\tHG01896\\tHG01912\\tHG01914\\tHG01915\\tHG01917\\tHG01918\\tHG01920\\tHG01921\\tHG01923\\tHG01924\\tHG01926\\tHG01927\\tHG01932\\tHG01933\\tHG01935\\tHG01936\\tHG01938\\tHG01939\\tHG01941\\tHG01942\\tHG01944\\tHG01945\\tHG01947\\tHG01948\\tHG01950\\tHG01951\\tHG01953\\tHG01954\\tHG01956\\tHG01958\\tHG01961\\tHG01965\\tHG01967\\tHG01968\\tHG01970\\tHG01971\\tHG01973\\tHG01974\\tHG01976\\tHG01977\\tHG01979\\tHG01980\\tHG01982\\tHG01985\\tHG01986\\tHG01988\\tHG01989\\tHG01990\\tHG01991\\tHG01992\\tHG01997\\tHG02002\\tHG02003\\tHG02006\\tHG02008\\tHG02009\\tHG02010\\tHG02012\\tHG02013\\tHG02014\\tHG02016\\tHG02017\\tHG02019\\tHG02020\\tHG02023\\tHG02025\\tHG02026\\tHG02028\\tHG02029\\tHG02031\\tHG02032\\tHG02035\\tHG02040\\tHG02047\\tHG02048\\tHG02049\\tHG02050\\tHG02051\\tHG02052\\tHG02053\\tHG02054\\tHG02057\\tHG02058\\tHG02060\\tHG02061\\tHG02064\\tHG02067\\tHG02069\\tHG02070\\tHG02072\\tHG02073\\tHG02075\\tHG02076\\tHG02078\\tHG02079\\tHG02081\\tHG02082\\tHG02084\\tHG02085\\tHG02086\\tHG02087\\tHG02088\\tHG02089\\tHG02090\\tHG02095\\tHG02102\\tHG02104\\tHG02105\\tHG0210
7\\tHG02108\\tHG02111\\tHG02113\\tHG02116\\tHG02121\\tHG02122\\tHG02127\\tHG02128\\tHG02130\\tHG02131\\tHG02133\\tHG02134\\tHG02136\\tHG02137\\tHG02138\\tHG02139\\tHG02140\\tHG02141\\tHG02142\\tHG02143\\tHG02144\\tHG02146\\tHG02147\\tHG02150\\tHG02151\\tHG02152\\tHG02153\\tHG02154\\tHG02155\\tHG02156\\tHG02164\\tHG02165\\tHG02166\\tHG02178\\tHG02179\\tHG02180\\tHG02181\\tHG02182\\tHG02184\\tHG02185\\tHG02186\\tHG02187\\tHG02188\\tHG02190\\tHG02215\\tHG02219\\tHG02220\\tHG02221\\tHG02223\\tHG02224\\tHG02230\\tHG02231\\tHG02232\\tHG02233\\tHG02235\\tHG02236\\tHG02238\\tHG02239\\tHG02250\\tHG02252\\tHG02253\\tHG02255\\tHG02256\\tHG02259\\tHG02260\\tHG02262\\tHG02265\\tHG02266\\tHG02271\\tHG02272\\tHG02274\\tHG02275\\tHG02277\\tHG02278\\tHG02281\\tHG02282\\tHG02283\\tHG02284\\tHG02285\\tHG02286\\tHG02291\\tHG02292\\tHG02298\\tHG02299\\tHG02301\\tHG02304\\tHG02307\\tHG02308\\tHG02309\\tHG02312\\tHG02314\\tHG02315\\tHG02317\\tHG02318\\tHG02322\\tHG02323\\tHG02325\\tHG02330\\tHG02332\\tHG02334\\tHG02337\\tHG02339\\tHG02343\\tHG02345\\tHG02348\\tHG02351\\tHG02353\\tHG02355\\tHG02356\\tHG02360\\tHG02364\\tHG02367\\tHG02371\\tHG02373\\tHG02374\\tHG02375\\tHG02379\\tHG02380\\tHG02382\\tHG02383\\tHG02384\\tHG02385\\tHG02386\\tHG02389\\tHG02390\\tHG02391\\tHG02392\\tHG02394\\tHG02395\\tHG02396\\tHG02397\\tHG02398\\tHG02399\\tHG02401\\tHG02402\\tHG02406\\tHG02407\\tHG02408\\tHG02409\\tHG02410\\tHG02419\\tHG02420\\tHG02425\\tHG02427\\tHG02429\\tHG02433\\tHG02439\\tHG02442\\tHG02445\\tHG02449\\tHG02450\\tHG02455\\tHG02461\\tHG02462\\tHG02464\\tHG02465\\tHG02470\\tHG02471\\tHG02476\\tHG02477\\tHG02479\\tHG02481\\tHG02484\\tHG02485\\tHG02489\\tHG02490\\tHG02491\\tHG02493\\tHG02494\\tHG02496\\tHG02497\\tHG02501\\tHG02502\\tHG02505\\tHG02508\\tHG02511\\tHG02512\\tHG02513\\tHG02521\\tHG02522\\tHG02536\\tHG02537\\tHG02541\\tHG02545\\tHG02546\\tHG02549\\tHG02554\\tHG02555\\tHG02557\\tHG02558\\tHG02561\\tHG02562\\tHG02568\\tHG02570\\tHG02571\\tHG02573\\tHG02574\\tHG02577\\tHG02580\\tHG0258
2\\tHG02583\\tHG02585\\tHG02586\\tHG02588\\tHG02589\\tHG02594\\tHG02595\\tHG02597\\tHG02600\\tHG02601\\tHG02603\\tHG02604\\tHG02610\\tHG02611\\tHG02613\\tHG02614\\tHG02620\\tHG02621\\tHG02623\\tHG02624\\tHG02628\\tHG02629\\tHG02634\\tHG02635\\tHG02642\\tHG02643\\tHG02645\\tHG02646\\tHG02648\\tHG02649\\tHG02651\\tHG02652\\tHG02654\\tHG02655\\tHG02657\\tHG02658\\tHG02660\\tHG02661\\tHG02666\\tHG02667\\tHG02675\\tHG02676\\tHG02678\\tHG02679\\tHG02681\\tHG02682\\tHG02684\\tHG02685\\tHG02687\\tHG02688\\tHG02690\\tHG02691\\tHG02694\\tHG02696\\tHG02697\\tHG02699\\tHG02700\\tHG02702\\tHG02703\\tHG02715\\tHG02716\\tHG02721\\tHG02722\\tHG02724\\tHG02725\\tHG02727\\tHG02728\\tHG02731\\tHG02733\\tHG02734\\tHG02736\\tHG02737\\tHG02756\\tHG02757\\tHG02759\\tHG02760\\tHG02763\\tHG02768\\tHG02769\\tHG02771\\tHG02772\\tHG02774\\tHG02775\\tHG02778\\tHG02780\\tHG02783\\tHG02784\\tHG02786\\tHG02787\\tHG02789\\tHG02790\\tHG02792\\tHG02793\\tHG02798\\tHG02799\\tHG02804\\tHG02805\\tHG02807\\tHG02808\\tHG02810\\tHG02811\\tHG02813\\tHG02814\\tHG02816\\tHG02817\\tHG02819\\tHG02820\\tHG02836\\tHG02837\\tHG02839\\tHG02840\\tHG02851\\tHG02852\\tHG02854\\tHG02855\\tHG02860\\tHG02861\\tHG02870\\tHG02878\\tHG02879\\tHG02881\\tHG02882\\tHG02884\\tHG02885\\tHG02887\\tHG02888\\tHG02890\\tHG02891\\tHG02895\\tHG02896\\tHG02922\\tHG02923\\tHG02938\\tHG02941\\tHG02943\\tHG02944\\tHG02946\\tHG02947\\tHG02952\\tHG02953\\tHG02968\\tHG02970\\tHG02971\\tHG02973\\tHG02974\\tHG02976\\tHG02977\\tHG02979\\tHG02981\\tHG02982\\tHG02983\\tHG03006\\tHG03007\\tHG03009\\tHG03012\\tHG03015\\tHG03016\\tHG03018\\tHG03019\\tHG03021\\tHG03022\\tHG03024\\tHG03025\\tHG03027\\tHG03028\\tHG03039\\tHG03040\\tHG03045\\tHG03046\\tHG03048\\tHG03049\\tHG03052\\tHG03054\\tHG03055\\tHG03057\\tHG03058\\tHG03060\\tHG03061\\tHG03063\\tHG03064\\tHG03066\\tHG03069\\tHG03072\\tHG03073\\tHG03074\\tHG03077\\tHG03078\\tHG03079\\tHG03081\\tHG03082\\tHG03084\\tHG03085\\tHG03086\\tHG03088\\tHG03091\\tHG03095\\tHG03096\\tHG03097\\tHG03099\\tHG0310
0\\tHG03103\\tHG03105\\tHG03108\\tHG03109\\tHG03111\\tHG03112\\tHG03114\\tHG03115\\tHG03117\\tHG03118\\tHG03120\\tHG03121\\tHG03123\\tHG03124\\tHG03126\\tHG03127\\tHG03129\\tHG03130\\tHG03132\\tHG03133\\tHG03135\\tHG03136\\tHG03139\\tHG03157\\tHG03159\\tHG03160\\tHG03162\\tHG03163\\tHG03166\\tHG03168\\tHG03169\\tHG03172\\tHG03175\\tHG03189\\tHG03190\\tHG03193\\tHG03195\\tHG03196\\tHG03198\\tHG03199\\tHG03202\\tHG03209\\tHG03212\\tHG03224\\tHG03225\\tHG03228\\tHG03229\\tHG03234\\tHG03235\\tHG03237\\tHG03238\\tHG03240\\tHG03241\\tHG03246\\tHG03247\\tHG03258\\tHG03259\\tHG03265\\tHG03267\\tHG03268\\tHG03270\\tHG03271\\tHG03279\\tHG03280\\tHG03291\\tHG03294\\tHG03295\\tHG03297\\tHG03298\\tHG03300\\tHG03301\\tHG03303\\tHG03304\\tHG03311\\tHG03313\\tHG03342\\tHG03343\\tHG03351\\tHG03352\\tHG03354\\tHG03363\\tHG03366\\tHG03367\\tHG03369\\tHG03370\\tHG03372\\tHG03376\\tHG03378\\tHG03380\\tHG03382\\tHG03385\\tHG03388\\tHG03391\\tHG03394\\tHG03397\\tHG03401\\tHG03410\\tHG03419\\tHG03428\\tHG03432\\tHG03433\\tHG03436\\tHG03437\\tHG03439\\tHG03442\\tHG03445\\tHG03446\\tHG03449\\tHG03451\\tHG03452\\tHG03455\\tHG03457\\tHG03458\\tHG03460\\tHG03461\\tHG03464\\tHG03469\\tHG03470\\tHG03472\\tHG03473\\tHG03476\\tHG03478\\tHG03479\\tHG03484\\tHG03485\\tHG03488\\tHG03490\\tHG03491\\tHG03499\\tHG03511\\tHG03514\\tHG03515\\tHG03517\\tHG03518\\tHG03520\\tHG03521\\tHG03538\\tHG03539\\tHG03547\\tHG03548\\tHG03556\\tHG03557\\tHG03558\\tHG03559\\tHG03563\\tHG03565\\tHG03567\\tHG03571\\tHG03572\\tHG03575\\tHG03577\\tHG03578\\tHG03583\\tHG03585\\tHG03589\\tHG03593\\tHG03594\\tHG03595\\tHG03598\\tHG03600\\tHG03603\\tHG03604\\tHG03607\\tHG03611\\tHG03615\\tHG03616\\tHG03619\\tHG03624\\tHG03625\\tHG03629\\tHG03631\\tHG03634\\tHG03636\\tHG03640\\tHG03642\\tHG03643\\tHG03644\\tHG03645\\tHG03646\\tHG03649\\tHG03652\\tHG03653\\tHG03660\\tHG03663\\tHG03667\\tHG03668\\tHG03672\\tHG03673\\tHG03679\\tHG03680\\tHG03681\\tHG03684\\tHG03685\\tHG03686\\tHG03687\\tHG03689\\tHG03690\\tHG03691\\tHG03692\\tHG0369
3\\tHG03694\\tHG03695\\tHG03696\\tHG03697\\tHG03698\\tHG03702\\tHG03703\\tHG03705\\tHG03706\\tHG03708\\tHG03709\\tHG03711\\tHG03713\\tHG03714\\tHG03716\\tHG03717\\tHG03718\\tHG03720\\tHG03722\\tHG03727\\tHG03729\\tHG03730\\tHG03731\\tHG03733\\tHG03736\\tHG03738\\tHG03740\\tHG03741\\tHG03742\\tHG03743\\tHG03744\\tHG03745\\tHG03746\\tHG03750\\tHG03752\\tHG03753\\tHG03754\\tHG03755\\tHG03756\\tHG03757\\tHG03760\\tHG03762\\tHG03765\\tHG03767\\tHG03770\\tHG03771\\tHG03772\\tHG03773\\tHG03774\\tHG03775\\tHG03777\\tHG03778\\tHG03779\\tHG03780\\tHG03781\\tHG03782\\tHG03784\\tHG03785\\tHG03786\\tHG03787\\tHG03788\\tHG03789\\tHG03790\\tHG03792\\tHG03793\\tHG03796\\tHG03800\\tHG03802\\tHG03803\\tHG03805\\tHG03808\\tHG03809\\tHG03812\\tHG03814\\tHG03815\\tHG03817\\tHG03821\\tHG03823\\tHG03824\\tHG03826\\tHG03829\\tHG03830\\tHG03832\\tHG03833\\tHG03836\\tHG03837\\tHG03838\\tHG03844\\tHG03846\\tHG03848\\tHG03849\\tHG03850\\tHG03851\\tHG03854\\tHG03856\\tHG03857\\tHG03858\\tHG03861\\tHG03862\\tHG03863\\tHG03864\\tHG03866\\tHG03867\\tHG03868\\tHG03869\\tHG03870\\tHG03871\\tHG03872\\tHG03873\\tHG03874\\tHG03875\\tHG03882\\tHG03884\\tHG03885\\tHG03886\\tHG03887\\tHG03888\\tHG03890\\tHG03894\\tHG03895\\tHG03896\\tHG03897\\tHG03898\\tHG03899\\tHG03900\\tHG03902\\tHG03905\\tHG03907\\tHG03908\\tHG03910\\tHG03911\\tHG03913\\tHG03914\\tHG03916\\tHG03917\\tHG03919\\tHG03920\\tHG03922\\tHG03925\\tHG03926\\tHG03928\\tHG03931\\tHG03934\\tHG03937\\tHG03940\\tHG03941\\tHG03943\\tHG03944\\tHG03945\\tHG03947\\tHG03949\\tHG03950\\tHG03951\\tHG03953\\tHG03955\\tHG03960\\tHG03963\\tHG03965\\tHG03967\\tHG03968\\tHG03969\\tHG03971\\tHG03973\\tHG03974\\tHG03976\\tHG03977\\tHG03978\\tHG03985\\tHG03986\\tHG03989\\tHG03990\\tHG03991\\tHG03995\\tHG03998\\tHG03999\\tHG04001\\tHG04002\\tHG04003\\tHG04006\\tHG04014\\tHG04015\\tHG04017\\tHG04018\\tHG04019\\tHG04020\\tHG04022\\tHG04023\\tHG04025\\tHG04026\\tHG04029\\tHG04033\\tHG04035\\tHG04038\\tHG04039\\tHG04042\\tHG04047\\tHG04054\\tHG04056\\tHG04059\\tHG0406
0\\tHG04061\\tHG04062\\tHG04063\\tHG04070\\tHG04075\\tHG04076\\tHG04080\\tHG04090\\tHG04093\\tHG04094\\tHG04096\\tHG04098\\tHG04099\\tHG04100\\tHG04106\\tHG04107\\tHG04118\\tHG04131\\tHG04134\\tHG04140\\tHG04141\\tHG04144\\tHG04146\\tHG04152\\tHG04153\\tHG04155\\tHG04156\\tHG04158\\tHG04159\\tHG04161\\tHG04162\\tHG04164\\tHG04171\\tHG04173\\tHG04176\\tHG04177\\tHG04180\\tHG04182\\tHG04183\\tHG04185\\tHG04186\\tHG04188\\tHG04189\\tHG04194\\tHG04195\\tHG04198\\tHG04200\\tHG04202\\tHG04206\\tHG04209\\tHG04210\\tHG04211\\tHG04212\\tHG04214\\tHG04216\\tHG04219\\tHG04222\\tHG04225\\tHG04227\\tHG04229\\tHG04235\\tHG04238\\tHG04239\\tNA06984\\tNA06985\\tNA06986\\tNA06989\\tNA06994\\tNA07000\\tNA07037\\tNA07048\\tNA07051\\tNA07056\\tNA07347\\tNA07357\\tNA10847\\tNA10851\\tNA11829\\tNA11830\\tNA11831\\tNA11832\\tNA11840\\tNA11843\\tNA11881\\tNA11892\\tNA11893\\tNA11894\\tNA11918\\tNA11919\\tNA11920\\tNA11930\\tNA11931\\tNA11932\\tNA11933\\tNA11992\\tNA11994\\tNA11995\\tNA12003\\tNA12004\\tNA12005\\tNA12006\\tNA12043\\tNA12044\\tNA12045\\tNA12046\\tNA12058\\tNA12144\\tNA12154\\tNA12155\\tNA12156\\tNA12234\\tNA12249\\tNA12272\\tNA12273\\tNA12275\\tNA12282\\tNA12283\\tNA12286\\tNA12287\\tNA12340\\tNA12341\\tNA12342\\tNA12347\\tNA12348\\tNA12383\\tNA12399\\tNA12400\\tNA12413\\tNA12414\\tNA12489\\tNA12546\\tNA12716\\tNA12717\\tNA12718\\tNA12748\\tNA12749\\tNA12750\\tNA12751\\tNA12760\\tNA12761\\tNA12762\\tNA12763\\tNA12775\\tNA12776\\tNA12777\\tNA12778\\tNA12812\\tNA12813\\tNA12814\\tNA12815\\tNA12827\\tNA12828\\tNA12829\\tNA12830\\tNA12842\\tNA12843\\tNA12872\\tNA12873\\tNA12874\\tNA12878\\tNA12889\\tNA12890\\tNA18486\\tNA18488\\tNA18489\\tNA18498\\tNA18499\\tNA18501\\tNA18502\\tNA18504\\tNA18505\\tNA18507\\tNA18508\\tNA18510\\tNA18511\\tNA18516\\tNA18517\\tNA18519\\tNA18520\\tNA18522\\tNA18523\\tNA18525\\tNA18526\\tNA18528\\tNA18530\\tNA18531\\tNA18532\\tNA18533\\tNA18534\\tNA18535\\tNA18536\\tNA18537\\tNA18538\\tNA18539\\tNA18541\\tNA18542\\tNA18543\\tNA18544\\tNA18545\\tNA1854
6\\tNA18547\\tNA18548\\tNA18549\\tNA18550\\tNA18552\\tNA18553\\tNA18555\\tNA18557\\tNA18558\\tNA18559\\tNA18560\\tNA18561\\tNA18562\\tNA18563\\tNA18564\\tNA18565\\tNA18566\\tNA18567\\tNA18570\\tNA18571\\tNA18572\\tNA18573\\tNA18574\\tNA18577\\tNA18579\\tNA18582\\tNA18591\\tNA18592\\tNA18593\\tNA18595\\tNA18596\\tNA18597\\tNA18599\\tNA18602\\tNA18603\\tNA18605\\tNA18606\\tNA18608\\tNA18609\\tNA18610\\tNA18611\\tNA18612\\tNA18613\\tNA18614\\tNA18615\\tNA18616\\tNA18617\\tNA18618\\tNA18619\\tNA18620\\tNA18621\\tNA18622\\tNA18623\\tNA18624\\tNA18625\\tNA18626\\tNA18627\\tNA18628\\tNA18629\\tNA18630\\tNA18631\\tNA18632\\tNA18633\\tNA18634\\tNA18635\\tNA18636\\tNA18637\\tNA18638\\tNA18639\\tNA18640\\tNA18641\\tNA18642\\tNA18643\\tNA18644\\tNA18645\\tNA18646\\tNA18647\\tNA18648\\tNA18740\\tNA18745\\tNA18747\\tNA18748\\tNA18749\\tNA18757\\tNA18853\\tNA18856\\tNA18858\\tNA18861\\tNA18864\\tNA18865\\tNA18867\\tNA18868\\tNA18870\\tNA18871\\tNA18873\\tNA18874\\tNA18876\\tNA18877\\tNA18878\\tNA18879\\tNA18881\\tNA18907\\tNA18908\\tNA18909\\tNA18910\\tNA18912\\tNA18915\\tNA18916\\tNA18917\\tNA18923\\tNA18924\\tNA18933\\tNA18934\\tNA18939\\tNA18940\\tNA18941\\tNA18942\\tNA18943\\tNA18944\\tNA18945\\tNA18946\\tNA18947\\tNA18948\\tNA18949\\tNA18950\\tNA18951\\tNA18952\\tNA18953\\tNA18954\\tNA18956\\tNA18957\\tNA18959\\tNA18960\\tNA18961\\tNA18962\\tNA18963\\tNA18964\\tNA18965\\tNA18966\\tNA18967\\tNA18968\\tNA18969\\tNA18970\\tNA18971\\tNA18972\\tNA18973\\tNA18974\\tNA18975\\tNA18976\\tNA18977\\tNA18978\\tNA18979\\tNA18980\\tNA18981\\tNA18982\\tNA18983\\tNA18984\\tNA18985\\tNA18986\\tNA18987\\tNA18988\\tNA18989\\tNA18990\\tNA18991\\tNA18992\\tNA18993\\tNA18994\\tNA18995\\tNA18997\\tNA18998\\tNA18999\\tNA19000\\tNA19001\\tNA19002\\tNA19003\\tNA19004\\tNA19005\\tNA19006\\tNA19007\\tNA19009\\tNA19010\\tNA19011\\tNA19012\\tNA19017\\tNA19019\\tNA19020\\tNA19023\\tNA19024\\tNA19025\\tNA19026\\tNA19027\\tNA19028\\tNA19030\\tNA19031\\tNA19035\\tNA19036\\tNA19037\\tNA19038\\tNA19041\\tNA1904
2\\tNA19043\\tNA19054\\tNA19055\\tNA19056\\tNA19057\\tNA19058\\tNA19059\\tNA19060\\tNA19062\\tNA19063\\tNA19064\\tNA19065\\tNA19066\\tNA19067\\tNA19068\\tNA19070\\tNA19072\\tNA19074\\tNA19075\\tNA19076\\tNA19077\\tNA19078\\tNA19079\\tNA19080\\tNA19081\\tNA19082\\tNA19083\\tNA19084\\tNA19085\\tNA19086\\tNA19087\\tNA19088\\tNA19089\\tNA19090\\tNA19091\\tNA19092\\tNA19093\\tNA19095\\tNA19096\\tNA19098\\tNA19099\\tNA19102\\tNA19107\\tNA19108\\tNA19113\\tNA19114\\tNA19116\\tNA19117\\tNA19118\\tNA19119\\tNA19121\\tNA19129\\tNA19130\\tNA19131\\tNA19137\\tNA19138\\tNA19141\\tNA19143\\tNA19144\\tNA19146\\tNA19147\\tNA19149\\tNA19152\\tNA19153\\tNA19159\\tNA19160\\tNA19171\\tNA19172\\tNA19175\\tNA19184\\tNA19185\\tNA19189\\tNA19190\\tNA19197\\tNA19198\\tNA19200\\tNA19201\\tNA19204\\tNA19206\\tNA19207\\tNA19209\\tNA19210\\tNA19213\\tNA19214\\tNA19222\\tNA19223\\tNA19225\\tNA19235\\tNA19236\\tNA19238\\tNA19239\\tNA19247\\tNA19248\\tNA19256\\tNA19257\\tNA19307\\tNA19308\\tNA19309\\tNA19310\\tNA19312\\tNA19314\\tNA19315\\tNA19316\\tNA19317\\tNA19318\\tNA19319\\tNA19320\\tNA19321\\tNA19323\\tNA19324\\tNA19327\\tNA19328\\tNA19331\\tNA19332\\tNA19334\\tNA19338\\tNA19346\\tNA19347\\tNA19350\\tNA19351\\tNA19355\\tNA19360\\tNA19372\\tNA19374\\tNA19375\\tNA19376\\tNA19377\\tNA19378\\tNA19379\\tNA19380\\tNA19383\\tNA19384\\tNA19385\\tNA19390\\tNA19391\\tNA19393\\tNA19394\\tNA19395\\tNA19397\\tNA19399\\tNA19401\\tNA19403\\tNA19404\\tNA19428\\tNA19429\\tNA19430\\tNA19431\\tNA19434\\tNA19435\\tNA19436\\tNA19437\\tNA19438\\tNA19439\\tNA19440\\tNA19443\\tNA19445\\tNA19446\\tNA19448\\tNA19449\\tNA19451\\tNA19452\\tNA19454\\tNA19455\\tNA19456\\tNA19457\\tNA19461\\tNA19462\\tNA19463\\tNA19466\\tNA19467\\tNA19468\\tNA19471\\tNA19472\\tNA19473\\tNA19474\\tNA19475\\tNA19625\\tNA19648\\tNA19649\\tNA19651\\tNA19652\\tNA19654\\tNA19655\\tNA19657\\tNA19658\\tNA19661\\tNA19663\\tNA19664\\tNA19669\\tNA19670\\tNA19676\\tNA19678\\tNA19679\\tNA19681\\tNA19682\\tNA19684\\tNA19700\\tNA19701\\tNA19703\\tNA1970
4\\tNA19707\\tNA19711\\tNA19712\\tNA19713\\tNA19716\\tNA19717\\tNA19719\\tNA19720\\tNA19722\\tNA19723\\tNA19725\\tNA19726\\tNA19728\\tNA19729\\tNA19731\\tNA19732\\tNA19734\\tNA19735\\tNA19740\\tNA19741\\tNA19746\\tNA19747\\tNA19749\\tNA19750\\tNA19752\\tNA19755\\tNA19756\\tNA19758\\tNA19759\\tNA19761\\tNA19762\\tNA19764\\tNA19770\\tNA19771\\tNA19773\\tNA19774\\tNA19776\\tNA19777\\tNA19779\\tNA19780\\tNA19782\\tNA19783\\tNA19785\\tNA19786\\tNA19788\\tNA19789\\tNA19792\\tNA19794\\tNA19795\\tNA19818\\tNA19819\\tNA19834\\tNA19835\\tNA19900\\tNA19901\\tNA19904\\tNA19908\\tNA19909\\tNA19913\\tNA19914\\tNA19916\\tNA19917\\tNA19920\\tNA19921\\tNA19922\\tNA19923\\tNA19982\\tNA19984\\tNA20126\\tNA20127\\tNA20274\\tNA20276\\tNA20278\\tNA20281\\tNA20282\\tNA20287\\tNA20289\\tNA20291\\tNA20294\\tNA20296\\tNA20298\\tNA20299\\tNA20314\\tNA20317\\tNA20318\\tNA20320\\tNA20321\\tNA20332\\tNA20334\\tNA20339\\tNA20340\\tNA20342\\tNA20346\\tNA20348\\tNA20351\\tNA20355\\tNA20356\\tNA20357\\tNA20359\\tNA20362\\tNA20412\\tNA20502\\tNA20503\\tNA20504\\tNA20505\\tNA20506\\tNA20507\\tNA20508\\tNA20509\\tNA20510\\tNA20511\\tNA20512\\tNA20513\\tNA20514\\tNA20515\\tNA20516\\tNA20517\\tNA20518\\tNA20519\\tNA20520\\tNA20521\\tNA20522\\tNA20524\\tNA20525\\tNA20527\\tNA20528\\tNA20529\\tNA20530\\tNA20531\\tNA20532\\tNA20533\\tNA20534\\tNA20535\\tNA20536\\tNA20538\\tNA20539\\tNA20540\\tNA20541\\tNA20542\\tNA20543\\tNA20544\\tNA20581\\tNA20582\\tNA20585\\tNA20586\\tNA20587\\tNA20588\\tNA20589\\tNA20752\\tNA20753\\tNA20754\\tNA20755\\tNA20756\\tNA20757\\tNA20758\\tNA20759\\tNA20760\\tNA20761\\tNA20762\\tNA20763\\tNA20764\\tNA20765\\tNA20766\\tNA20767\\tNA20768\\tNA20769\\tNA20770\\tNA20771\\tNA20772\\tNA20773\\tNA20774\\tNA20775\\tNA20778\\tNA20783\\tNA20785\\tNA20786\\tNA20787\\tNA20790\\tNA20792\\tNA20795\\tNA20796\\tNA20797\\tNA20798\\tNA20799\\tNA20800\\tNA20801\\tNA20802\\tNA20803\\tNA20804\\tNA20805\\tNA20806\\tNA20807\\tNA20808\\tNA20809\\tNA20810\\tNA20811\\tNA20812\\tNA20813\\tNA20814\\tNA2081
5\\tNA20818\\tNA20819\\tNA20821\\tNA20822\\tNA20826\\tNA20827\\tNA20828\\tNA20832\\tNA20845\\tNA20846\\tNA20847\\tNA20849\\tNA20850\\tNA20851\\tNA20852\\tNA20853\\tNA20854\\tNA20856\\tNA20858\\tNA20859\\tNA20861\\tNA20862\\tNA20863\\tNA20864\\tNA20866\\tNA20867\\tNA20868\\tNA20869\\tNA20870\\tNA20872\\tNA20874\\tNA20875\\tNA20876\\tNA20877\\tNA20878\\tNA20881\\tNA20882\\tNA20884\\tNA20885\\tNA20886\\tNA20887\\tNA20888\\tNA20889\\tNA20890\\tNA20891\\tNA20892\\tNA20894\\tNA20895\\tNA20896\\tNA20897\\tNA20899\\tNA20900\\tNA20901\\tNA20902\\tNA20903\\tNA20904\\tNA20905\\tNA20906\\tNA20908\\tNA20910\\tNA20911\\tNA21086\\tNA21087\\tNA21088\\tNA21089\\tNA21090\\tNA21091\\tNA21092\\tNA21093\\tNA21094\\tNA21095\\tNA21097\\tNA21098\\tNA21099\\tNA21100\\tNA21101\\tNA21102\\tNA21103\\tNA21104\\tNA21105\\tNA21106\\tNA21107\\tNA21108\\tNA21109\\tNA21110\\tNA21111\\tNA21112\\tNA21113\\tNA21114\\tNA21115\\tNA21116\\tNA21117\\tNA21118\\tNA21119\\tNA21120\\tNA21122\\tNA21123\\tNA21124\\tNA21125\\tNA21126\\tNA21127\\tNA21128\\tNA21129\\tNA21130\\tNA21133\\tNA21135\\tNA21137\\tNA21141\\tNA21142\\tNA21143\\tNA21144\\n']"
313
      ]
314
     },
315
     "execution_count": 21,
316
     "metadata": {},
317
     "output_type": "execute_result"
318
    }
319
   ],
320
   "source": [
321
    "meta"
322
   ]
323
  },
324
  {
325
   "cell_type": "code",
326
   "execution_count": 2,
327
   "metadata": {},
328
   "outputs": [],
329
   "source": [
330
    "data = pd.read_csv(\"/Users/judydu/Desktop/AI4All/data/ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes_LDsubset.vcf\",\n",
331
    "                   sep='\\t', comment = \"#\",\n",
332
    "                   skiprows = 251, index_col = False, header = None)"
333
   ]
334
  },
335
  {
336
   "cell_type": "code",
337
   "execution_count": 19,
338
   "metadata": {},
339
   "outputs": [
340
    {
341
     "data": {
342
      "text/plain": [
343
       "(1135, 2513)"
344
      ]
345
     },
346
     "execution_count": 19,
347
     "metadata": {},
348
     "output_type": "execute_result"
349
    }
350
   ],
351
   "source": [
352
    "data.shape"
353
   ]
354
  },
355
  {
356
   "cell_type": "code",
357
   "execution_count": 22,
358
   "metadata": {},
359
   "outputs": [],
360
   "source": [
361
    "data.columns = meta[len(meta)-1].split(\"\\t\")"
362
   ]
363
  },
364
  {
365
   "cell_type": "code",
366
   "execution_count": 23,
367
   "metadata": {
368
    "scrolled": false
369
   },
370
   "outputs": [
371
    {
372
     "data": {
373
      "text/html": [
374
       "<div>\n",
375
       "<style scoped>\n",
376
       "    .dataframe tbody tr th:only-of-type {\n",
377
       "        vertical-align: middle;\n",
378
       "    }\n",
379
       "\n",
380
       "    .dataframe tbody tr th {\n",
381
       "        vertical-align: top;\n",
382
       "    }\n",
383
       "\n",
384
       "    .dataframe thead th {\n",
385
       "        text-align: right;\n",
386
       "    }\n",
387
       "</style>\n",
388
       "<table border=\"1\" class=\"dataframe\">\n",
389
       "  <thead>\n",
390
       "    <tr style=\"text-align: right;\">\n",
391
       "      <th></th>\n",
392
       "      <th>#CHROM</th>\n",
393
       "      <th>POS</th>\n",
394
       "      <th>ID</th>\n",
395
       "      <th>REF</th>\n",
396
       "      <th>ALT</th>\n",
397
       "      <th>QUAL</th>\n",
398
       "      <th>FILTER</th>\n",
399
       "      <th>INFO</th>\n",
400
       "      <th>FORMAT</th>\n",
401
       "      <th>HG00096</th>\n",
402
       "      <th>...</th>\n",
403
       "      <th>NA21128</th>\n",
404
       "      <th>NA21129</th>\n",
405
       "      <th>NA21130</th>\n",
406
       "      <th>NA21133</th>\n",
407
       "      <th>NA21135</th>\n",
408
       "      <th>NA21137</th>\n",
409
       "      <th>NA21141</th>\n",
410
       "      <th>NA21142</th>\n",
411
       "      <th>NA21143</th>\n",
412
       "      <th>NA21144</th>\n",
413
       "    </tr>\n",
414
       "  </thead>\n",
415
       "  <tbody>\n",
416
       "    <tr>\n",
417
       "      <th>0</th>\n",
418
       "      <td>1</td>\n",
419
       "      <td>1005806</td>\n",
420
       "      <td>rs3934834</td>\n",
421
       "      <td>C</td>\n",
422
       "      <td>T</td>\n",
423
       "      <td>100</td>\n",
424
       "      <td>PASS</td>\n",
425
       "      <td>AA=C|||;AC=1119;AF=0.223442;AFR_AF=0.3941;AMR_...</td>\n",
426
       "      <td>GT</td>\n",
427
       "      <td>0|0</td>\n",
428
       "      <td>...</td>\n",
429
       "      <td>0|0</td>\n",
430
       "      <td>0|0</td>\n",
431
       "      <td>0|1</td>\n",
432
       "      <td>0|1</td>\n",
433
       "      <td>0|0</td>\n",
434
       "      <td>1|0</td>\n",
435
       "      <td>1|0</td>\n",
436
       "      <td>0|0</td>\n",
437
       "      <td>0|0</td>\n",
438
       "      <td>1|0</td>\n",
439
       "    </tr>\n",
440
       "    <tr>\n",
441
       "      <th>1</th>\n",
442
       "      <td>1</td>\n",
443
       "      <td>1079198</td>\n",
444
       "      <td>rs11260603</td>\n",
445
       "      <td>T</td>\n",
446
       "      <td>C</td>\n",
447
       "      <td>100</td>\n",
448
       "      <td>PASS</td>\n",
449
       "      <td>AA=c|||;AC=1520;AF=0.303514;AFR_AF=0.6271;AMR_...</td>\n",
450
       "      <td>GT</td>\n",
451
       "      <td>0|1</td>\n",
452
       "      <td>...</td>\n",
453
       "      <td>0|0</td>\n",
454
       "      <td>0|0</td>\n",
455
       "      <td>0|0</td>\n",
456
       "      <td>0|0</td>\n",
457
       "      <td>0|0</td>\n",
458
       "      <td>0|0</td>\n",
459
       "      <td>0|0</td>\n",
460
       "      <td>0|0</td>\n",
461
       "      <td>0|1</td>\n",
462
       "      <td>0|0</td>\n",
463
       "    </tr>\n",
464
       "    <tr>\n",
465
       "      <th>2</th>\n",
466
       "      <td>1</td>\n",
467
       "      <td>1247494</td>\n",
468
       "      <td>rs12103</td>\n",
469
       "      <td>T</td>\n",
470
       "      <td>C</td>\n",
471
       "      <td>100</td>\n",
472
       "      <td>PASS</td>\n",
473
       "      <td>AA=T|||;AC=1599;AF=0.319289;AFR_AF=0.0923;AMR_...</td>\n",
474
       "      <td>GT</td>\n",
475
       "      <td>1|0</td>\n",
476
       "      <td>...</td>\n",
477
       "      <td>1|0</td>\n",
478
       "      <td>1|0</td>\n",
479
       "      <td>0|0</td>\n",
480
       "      <td>0|0</td>\n",
481
       "      <td>1|1</td>\n",
482
       "      <td>0|1</td>\n",
483
       "      <td>0|0</td>\n",
484
       "      <td>0|0</td>\n",
485
       "      <td>0|0</td>\n",
486
       "      <td>0|1</td>\n",
487
       "    </tr>\n",
488
       "    <tr>\n",
489
       "      <th>3</th>\n",
490
       "      <td>1</td>\n",
491
       "      <td>2069172</td>\n",
492
       "      <td>rs425277</td>\n",
493
       "      <td>C</td>\n",
494
       "      <td>T</td>\n",
495
       "      <td>100</td>\n",
496
       "      <td>PASS</td>\n",
497
       "      <td>AA=C|||;AC=1128;AF=0.22524;AFR_AF=0.0666;AMR_A...</td>\n",
498
       "      <td>GT</td>\n",
499
       "      <td>1|0</td>\n",
500
       "      <td>...</td>\n",
501
       "      <td>0|0</td>\n",
502
       "      <td>1|0</td>\n",
503
       "      <td>1|0</td>\n",
504
       "      <td>0|0</td>\n",
505
       "      <td>0|1</td>\n",
506
       "      <td>0|0</td>\n",
507
       "      <td>0|1</td>\n",
508
       "      <td>0|0</td>\n",
509
       "      <td>0|1</td>\n",
510
       "      <td>0|0</td>\n",
511
       "    </tr>\n",
512
       "    <tr>\n",
513
       "      <th>4</th>\n",
514
       "      <td>1</td>\n",
515
       "      <td>2069681</td>\n",
516
       "      <td>rs3753242</td>\n",
517
       "      <td>C</td>\n",
518
       "      <td>T</td>\n",
519
       "      <td>100</td>\n",
520
       "      <td>PASS</td>\n",
521
       "      <td>AA=C|||;AC=943;AF=0.188299;AFR_AF=0.0197;AMR_A...</td>\n",
522
       "      <td>GT</td>\n",
523
       "      <td>0|0</td>\n",
524
       "      <td>...</td>\n",
525
       "      <td>0|1</td>\n",
526
       "      <td>0|0</td>\n",
527
       "      <td>0|0</td>\n",
528
       "      <td>0|0</td>\n",
529
       "      <td>1|0</td>\n",
530
       "      <td>0|0</td>\n",
531
       "      <td>1|0</td>\n",
532
       "      <td>0|1</td>\n",
533
       "      <td>0|0</td>\n",
534
       "      <td>0|0</td>\n",
535
       "    </tr>\n",
536
       "  </tbody>\n",
537
       "</table>\n",
538
       "<p>5 rows × 2513 columns</p>\n",
539
       "</div>"
540
      ],
541
      "text/plain": [
542
       "   #CHROM      POS          ID REF ALT  QUAL FILTER  \\\n",
543
       "0       1  1005806   rs3934834   C   T   100   PASS   \n",
544
       "1       1  1079198  rs11260603   T   C   100   PASS   \n",
545
       "2       1  1247494     rs12103   T   C   100   PASS   \n",
546
       "3       1  2069172    rs425277   C   T   100   PASS   \n",
547
       "4       1  2069681   rs3753242   C   T   100   PASS   \n",
548
       "\n",
549
       "                                                INFO FORMAT HG00096  ...  \\\n",
550
       "0  AA=C|||;AC=1119;AF=0.223442;AFR_AF=0.3941;AMR_...     GT     0|0  ...   \n",
551
       "1  AA=c|||;AC=1520;AF=0.303514;AFR_AF=0.6271;AMR_...     GT     0|1  ...   \n",
552
       "2  AA=T|||;AC=1599;AF=0.319289;AFR_AF=0.0923;AMR_...     GT     1|0  ...   \n",
553
       "3  AA=C|||;AC=1128;AF=0.22524;AFR_AF=0.0666;AMR_A...     GT     1|0  ...   \n",
554
       "4  AA=C|||;AC=943;AF=0.188299;AFR_AF=0.0197;AMR_A...     GT     0|0  ...   \n",
555
       "\n",
556
       "  NA21128 NA21129 NA21130 NA21133 NA21135 NA21137 NA21141 NA21142 NA21143  \\\n",
557
       "0     0|0     0|0     0|1     0|1     0|0     1|0     1|0     0|0     0|0   \n",
558
       "1     0|0     0|0     0|0     0|0     0|0     0|0     0|0     0|0     0|1   \n",
559
       "2     1|0     1|0     0|0     0|0     1|1     0|1     0|0     0|0     0|0   \n",
560
       "3     0|0     1|0     1|0     0|0     0|1     0|0     0|1     0|0     0|1   \n",
561
       "4     0|1     0|0     0|0     0|0     1|0     0|0     1|0     0|1     0|0   \n",
562
       "\n",
563
       "  NA21144\\n  \n",
564
       "0       1|0  \n",
565
       "1       0|0  \n",
566
       "2       0|1  \n",
567
       "3       0|0  \n",
568
       "4       0|0  \n",
569
       "\n",
570
       "[5 rows x 2513 columns]"
571
      ]
572
     },
573
     "execution_count": 23,
574
     "metadata": {},
575
     "output_type": "execute_result"
576
    }
577
   ],
578
   "source": [
579
    "data.head()"
580
   ]
581
  },
582
  {
583
   "cell_type": "code",
584
   "execution_count": 34,
585
   "metadata": {},
586
   "outputs": [],
587
   "source": [
588
    "trainingX = data.iloc[:,9:2513]"
589
   ]
590
  },
591
  {
592
   "cell_type": "markdown",
593
   "metadata": {},
594
   "source": [
595
    "## Split data randomly into training and testing sets\n",
596
    "- Later, we may want to consider balancing the training and testing sets as a function of y (geographical origin)"
597
   ]
598
  },
599
  {
600
   "cell_type": "code",
601
   "execution_count": null,
602
   "metadata": {},
603
   "outputs": [],
604
   "source": [
605
    "# Naive label-encoding of genotypes\n",
606
    "align X and Y\n",
607
    "label encode correct subset of X\n",
608
    "\n",
609
    "# subset filename for chromosome number\n",
610
    "chrom = path[chrom].split(\".\")[]"
611
   ]
612
  },
613
  {
614
   "cell_type": "code",
615
   "execution_count": null,
616
   "metadata": {},
617
   "outputs": [],
618
   "source": [
619
    "trainX = {}\n",
620
    "trainY = {}\n",
621
    "testX = {}\n",
622
    "testY = {}"
623
   ]
624
  },
625
  {
626
   "cell_type": "code",
627
   "execution_count": null,
628
   "metadata": {},
629
   "outputs": [],
630
   "source": [
631
    "## randomly split into training/testing\n",
632
    "train_test_split(test_size = .2, random_state = 7, shuffle = True)"
633
   ]
634
  },
635
  {
636
   "cell_type": "markdown",
637
   "metadata": {},
638
   "source": [
639
    "## feature selection via Chi-squared test of indep"
640
   ]
641
  },
642
  {
643
   "cell_type": "code",
644
   "execution_count": null,
645
   "metadata": {},
646
   "outputs": [],
647
   "source": [
648
    "chi2Statistic,pvals = chi2(trainX[chrom], trainY[chrom])\n",
649
    "trainX[chrom + \"_chi2\"] = trainX[chrom].filter()"
650
   ]
651
  },
652
  {
653
   "cell_type": "markdown",
654
   "metadata": {},
655
   "source": [
656
    "# Feature Selection and Data Processing\n",
657
    "1. Label encode discrete features not listed as continuous in metadata (chunk 1)\n",
658
    "2. Normalize cont'd features: mean center and divide by norm w.r.t. training samples (chunk 2)\n",
659
    "3. Mutual information regression feature selection (chunk 3-4)"
660
   ]
661
  },
662
  {
663
   "cell_type": "code",
664
   "execution_count": 35,
665
   "metadata": {},
666
   "outputs": [],
667
   "source": [
668
    "trainingX = trainingX.apply(LabelEncoder().fit_transform)\n",
669
    "\n",
670
    "#preprocessing.LabelEncoder().fit_transform(trainingX)"
671
   ]
672
  },
673
  {
674
   "cell_type": "code",
675
   "execution_count": 36,
676
   "metadata": {},
677
   "outputs": [
678
    {
679
     "data": {
680
      "text/html": [
681
       "<div>\n",
682
       "<style scoped>\n",
683
       "    .dataframe tbody tr th:only-of-type {\n",
684
       "        vertical-align: middle;\n",
685
       "    }\n",
686
       "\n",
687
       "    .dataframe tbody tr th {\n",
688
       "        vertical-align: top;\n",
689
       "    }\n",
690
       "\n",
691
       "    .dataframe thead th {\n",
692
       "        text-align: right;\n",
693
       "    }\n",
694
       "</style>\n",
695
       "<table border=\"1\" class=\"dataframe\">\n",
696
       "  <thead>\n",
697
       "    <tr style=\"text-align: right;\">\n",
698
       "      <th></th>\n",
699
       "      <th>HG00096</th>\n",
700
       "      <th>HG00097</th>\n",
701
       "      <th>HG00099</th>\n",
702
       "      <th>HG00100</th>\n",
703
       "      <th>HG00101</th>\n",
704
       "      <th>HG00102</th>\n",
705
       "      <th>HG00103</th>\n",
706
       "      <th>HG00105</th>\n",
707
       "      <th>HG00106</th>\n",
708
       "      <th>HG00107</th>\n",
709
       "      <th>...</th>\n",
710
       "      <th>NA21128</th>\n",
711
       "      <th>NA21129</th>\n",
712
       "      <th>NA21130</th>\n",
713
       "      <th>NA21133</th>\n",
714
       "      <th>NA21135</th>\n",
715
       "      <th>NA21137</th>\n",
716
       "      <th>NA21141</th>\n",
717
       "      <th>NA21142</th>\n",
718
       "      <th>NA21143</th>\n",
719
       "      <th>NA21144</th>\n",
720
       "    </tr>\n",
721
       "  </thead>\n",
722
       "  <tbody>\n",
723
       "    <tr>\n",
724
       "      <th>0</th>\n",
725
       "      <td>0</td>\n",
726
       "      <td>1</td>\n",
727
       "      <td>0</td>\n",
728
       "      <td>2</td>\n",
729
       "      <td>0</td>\n",
730
       "      <td>0</td>\n",
731
       "      <td>0</td>\n",
732
       "      <td>0</td>\n",
733
       "      <td>0</td>\n",
734
       "      <td>0</td>\n",
735
       "      <td>...</td>\n",
736
       "      <td>0</td>\n",
737
       "      <td>0</td>\n",
738
       "      <td>1</td>\n",
739
       "      <td>1</td>\n",
740
       "      <td>0</td>\n",
741
       "      <td>2</td>\n",
742
       "      <td>2</td>\n",
743
       "      <td>0</td>\n",
744
       "      <td>0</td>\n",
745
       "      <td>2</td>\n",
746
       "    </tr>\n",
747
       "    <tr>\n",
748
       "      <th>1</th>\n",
749
       "      <td>1</td>\n",
750
       "      <td>0</td>\n",
751
       "      <td>0</td>\n",
752
       "      <td>2</td>\n",
753
       "      <td>1</td>\n",
754
       "      <td>3</td>\n",
755
       "      <td>1</td>\n",
756
       "      <td>1</td>\n",
757
       "      <td>0</td>\n",
758
       "      <td>3</td>\n",
759
       "      <td>...</td>\n",
760
       "      <td>0</td>\n",
761
       "      <td>0</td>\n",
762
       "      <td>0</td>\n",
763
       "      <td>0</td>\n",
764
       "      <td>0</td>\n",
765
       "      <td>0</td>\n",
766
       "      <td>0</td>\n",
767
       "      <td>0</td>\n",
768
       "      <td>1</td>\n",
769
       "      <td>0</td>\n",
770
       "    </tr>\n",
771
       "    <tr>\n",
772
       "      <th>2</th>\n",
773
       "      <td>2</td>\n",
774
       "      <td>3</td>\n",
775
       "      <td>3</td>\n",
776
       "      <td>3</td>\n",
777
       "      <td>4</td>\n",
778
       "      <td>3</td>\n",
779
       "      <td>1</td>\n",
780
       "      <td>3</td>\n",
781
       "      <td>4</td>\n",
782
       "      <td>1</td>\n",
783
       "      <td>...</td>\n",
784
       "      <td>3</td>\n",
785
       "      <td>3</td>\n",
786
       "      <td>0</td>\n",
787
       "      <td>0</td>\n",
788
       "      <td>4</td>\n",
789
       "      <td>1</td>\n",
790
       "      <td>0</td>\n",
791
       "      <td>0</td>\n",
792
       "      <td>0</td>\n",
793
       "      <td>1</td>\n",
794
       "    </tr>\n",
795
       "    <tr>\n",
796
       "      <th>3</th>\n",
797
       "      <td>2</td>\n",
798
       "      <td>0</td>\n",
799
       "      <td>2</td>\n",
800
       "      <td>2</td>\n",
801
       "      <td>0</td>\n",
802
       "      <td>1</td>\n",
803
       "      <td>0</td>\n",
804
       "      <td>1</td>\n",
805
       "      <td>0</td>\n",
806
       "      <td>1</td>\n",
807
       "      <td>...</td>\n",
808
       "      <td>0</td>\n",
809
       "      <td>3</td>\n",
810
       "      <td>2</td>\n",
811
       "      <td>0</td>\n",
812
       "      <td>1</td>\n",
813
       "      <td>0</td>\n",
814
       "      <td>1</td>\n",
815
       "      <td>0</td>\n",
816
       "      <td>1</td>\n",
817
       "      <td>0</td>\n",
818
       "    </tr>\n",
819
       "    <tr>\n",
820
       "      <th>4</th>\n",
821
       "      <td>0</td>\n",
822
       "      <td>0</td>\n",
823
       "      <td>0</td>\n",
824
       "      <td>0</td>\n",
825
       "      <td>0</td>\n",
826
       "      <td>0</td>\n",
827
       "      <td>0</td>\n",
828
       "      <td>0</td>\n",
829
       "      <td>0</td>\n",
830
       "      <td>0</td>\n",
831
       "      <td>...</td>\n",
832
       "      <td>1</td>\n",
833
       "      <td>0</td>\n",
834
       "      <td>0</td>\n",
835
       "      <td>0</td>\n",
836
       "      <td>3</td>\n",
837
       "      <td>0</td>\n",
838
       "      <td>2</td>\n",
839
       "      <td>1</td>\n",
840
       "      <td>0</td>\n",
841
       "      <td>0</td>\n",
842
       "    </tr>\n",
843
       "  </tbody>\n",
844
       "</table>\n",
845
       "<p>5 rows × 2504 columns</p>\n",
846
       "</div>"
847
      ],
848
      "text/plain": [
849
       "   HG00096  HG00097  HG00099  HG00100  HG00101  HG00102  HG00103  HG00105  \\\n",
850
       "0        0        1        0        2        0        0        0        0   \n",
851
       "1        1        0        0        2        1        3        1        1   \n",
852
       "2        2        3        3        3        4        3        1        3   \n",
853
       "3        2        0        2        2        0        1        0        1   \n",
854
       "4        0        0        0        0        0        0        0        0   \n",
855
       "\n",
856
       "   HG00106  HG00107  ...  NA21128  NA21129  NA21130  NA21133  NA21135  \\\n",
857
       "0        0        0  ...        0        0        1        1        0   \n",
858
       "1        0        3  ...        0        0        0        0        0   \n",
859
       "2        4        1  ...        3        3        0        0        4   \n",
860
       "3        0        1  ...        0        3        2        0        1   \n",
861
       "4        0        0  ...        1        0        0        0        3   \n",
862
       "\n",
863
       "   NA21137  NA21141  NA21142  NA21143  NA21144\\n  \n",
864
       "0        2        2        0        0          2  \n",
865
       "1        0        0        0        1          0  \n",
866
       "2        1        0        0        0          1  \n",
867
       "3        0        1        0        1          0  \n",
868
       "4        0        2        1        0          0  \n",
869
       "\n",
870
       "[5 rows x 2504 columns]"
871
      ]
872
     },
873
     "execution_count": 36,
874
     "metadata": {},
875
     "output_type": "execute_result"
876
    }
877
   ],
878
   "source": [
879
    "trainingX.head()"
880
   ]
881
  },
882
  {
883
   "cell_type": "code",
884
   "execution_count": 373,
885
   "metadata": {},
886
   "outputs": [],
887
   "source": [
888
    "def MIfeatureSelector(trainingX, trainingY, responseString, meta, MIthreshold, testingX):\n",
889
    "    # drop featuresY that are NA values. We will not be imputing missing responses.\n",
890
    "    trainingY = trainingY.dropna(subset = [responseString])\n",
891
    "    trainingY = trainingY[[responseString]]\n",
892
    "    \n",
893
    "    #Match IDs of X and y\n",
894
    "    trainingX = trainingX.loc[trainingY.index,:]\n",
895
    "    \n",
896
    "    #Match challegeIDs of meta\n",
897
    "    meta = meta.loc[meta[\"old_name\"].isin(trainingX)]\n",
898
    "    trainingX = trainingX.loc[:,meta.old_name]\n",
899
    "    \n",
900
    "    #MI regression\n",
901
    "    mi = mutual_info_regression(X = trainingX,\n",
902
    "                            y = trainingY[responseString],\n",
903
    "                            discrete_features = list(meta[\"discrete\"]), \n",
904
    "                            random_state = 10)\n",
905
    "    trainingX = trainingX.loc[:, mi >= MIthreshold]\n",
906
    "    #Return non-NA training responses, training features with matching challengeIDs and appropriate MI\n",
907
    "\n",
908
    "    return trainingX, trainingY, testingX[trainingX.columns]"
909
   ]
910
  },
911
  {
912
   "cell_type": "markdown",
913
   "metadata": {},
914
   "source": [
915
    "# PCA, SVM"
916
   ]
917
  },
918
  {
919
   "cell_type": "code",
920
   "execution_count": null,
921
   "metadata": {},
922
   "outputs": [],
923
   "source": [
924
    "def fitPCA(X):\n",
925
    "    pca = PCA(n_components = True)\n",
926
    "    pca.fit(X)\n",
927
    "    #return pca.components_, pca.explained_variance_ratio_\n",
928
    "    return pca\n",
929
    "pca_chr = fitPCA(data.iloc[].transpose())"
930
   ]
931
  },
932
  {
933
   "cell_type": "code",
934
   "execution_count": null,
935
   "metadata": {},
936
   "outputs": [],
937
   "source": [
938
    "def fitSVM(X,Y, testX, testY):\n",
939
    "    # fit SVM using hinge loss, L2 penalty via SGD\n",
940
    "    svm = SGDClassifier(random_state = 7).fit(X,Y)\n",
941
    "    return svm, svm.coef_, svm.score(testX, testY)\n",
942
    "\n",
943
    "models = {}; featureWeights = {}; predictionScore = {}\n",
944
    "\n",
945
    "for key in trainXDict:\n",
946
    "    newkey = \"SVM_\" + key\n",
947
    "    print(newkey)\n",
948
    "    models[newkey], featureWeights[newkey], predictionScore[newkey] = fitSVM(\n",
949
    "        X = trainX[key].iloc[:,0],\n",
950
    "        Y = trainY[key],\n",
951
    "        testX = testX[key],\n",
952
    "        testY = testY[key]\n",
953
    "    )\n",
954
    "    models[\"PCA_\" + key] = fitPCA(X = trainX[key])"
955
   ]
956
  },
957
  {
958
   "cell_type": "markdown",
959
   "metadata": {},
960
   "source": [
961
    "# Analysis"
962
   ]
963
  },
964
  {
965
   "cell_type": "code",
966
   "execution_count": null,
967
   "metadata": {},
968
   "outputs": [],
969
   "source": [
970
    "plot histogram pca.explained_variance_ratio_\n",
971
    "plot pca.components_ 0 and 1 color by testing\n",
972
    "\n",
973
    "plot svm.score(testX, testY)\n",
974
    "plot coef name by top 10 svm.coef_ ordering by level \n",
975
    "variation score in chi squared test"
976
   ]
977
  }
978
 ],
979
 "metadata": {
980
  "kernelspec": {
981
   "display_name": "Python 3",
982
   "language": "python",
983
   "name": "python3"
984
  },
985
  "language_info": {
986
   "codemirror_mode": {
987
    "name": "ipython",
988
    "version": 3
989
   },
990
   "file_extension": ".py",
991
   "mimetype": "text/x-python",
992
   "name": "python",
993
   "nbconvert_exporter": "python",
994
   "pygments_lexer": "ipython3",
995
   "version": "3.7.3"
996
  }
997
 },
998
 "nbformat": 4,
999
 "nbformat_minor": 2
1000
}