--- a
+++ b/exploratory_analysis.ipynb
@@ -0,0 +1,257 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Exploratory analysis\n",
+    "\n",
+    "This notebook contains exploratory analysis of the medical diagnosis data set."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import os\n",
+    "import pdb\n",
+    "import pickle\n",
+    "import urllib\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from bs4 import BeautifulSoup\n",
+    "from scipy import stats\n",
+    "\n",
+    "%matplotlib inline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "-\n",
+      "('Vocab size:', 21520, 'unique words')\n",
+      "('Story max length:', 1647, 'words')\n",
+      "('Number of training stories:', 133093)\n",
+      "('Number of test stories:', 59394)\n",
+      "-\n",
+      "Here's what a \"story\" tuple looks like (input, query, answer):\n",
+      "([u'barrett', u'esophagus', u'itself', u'does', u'not', u'cause', u'symptoms', u'.', u'many', u'people', u'with', u'this', u'condition', u'do', u'not', u'have', u'any', u'symptoms', u'.', u'the', u'acid', u'reflux', u'that', u'causes', u'barrett', u'esophagus', u'often', u'leads', u'to', u'symptoms', u'of', u'heartburn', u'.'], u'barrett esophagus')\n",
+      "-\n",
+      "Vectorizing the word sequences...\n"
+     ]
+    }
+   ],
+   "source": [
+    "def load_pickle(path):\n",
+    "    \"\"\"Return the object pickled at `path`, closing the file afterwards.\n",
+    "\n",
+    "    pickle can execute arbitrary code while loading, so only use this on\n",
+    "    trusted, locally generated files.\n",
+    "    \"\"\"\n",
+    "    # Binary mode: pickle data is bytes, and Python 3 rejects text mode.\n",
+    "    with open(path, 'rb') as handle:\n",
+    "        return pickle.load(handle)\n",
+    "\n",
+    "\n",
+    "def flatten_facts(facts):\n",
+    "    \"\"\"Flatten each (fact_list, answer) pair into (token_list, answer).\n",
+    "\n",
+    "    The tokens of all facts in a pair are concatenated in order into one\n",
+    "    list; the answer is passed through unchanged. An empty fact_list gives\n",
+    "    an empty token list.\n",
+    "    \"\"\"\n",
+    "    return [([token for fact in fact_list for token in fact], answer)\n",
+    "            for fact_list, answer in facts]\n",
+    "\n",
+    "\n",
+    "train_facts = load_pickle('data/training_set.dat')\n",
+    "test_facts = load_pickle('data/test_set.dat')\n",
+    "\n",
+    "train_stories = flatten_facts(train_facts)\n",
+    "test_stories = flatten_facts(test_facts)\n",
+    "all_stories = train_stories + test_stories\n",
+    "\n",
+    "# Distinct tokens that occur inside the stories.\n",
+    "story_vocab = sorted({token for story, _ in all_stories for token in story})\n",
+    "# Story tokens plus every answer string (each answer is kept as one entry).\n",
+    "vocab = sorted(set(story_vocab).union(answer for _, answer in all_stories))\n",
+    "\n",
+    "# Reserve 0 for masking via pad_sequences\n",
+    "vocab_size = len(vocab) + 1\n",
+    "story_maxlen = max(len(story) for story, _ in all_stories)\n",
+    "\n",
+    "# Single-argument print(...) with .format() gives the same output on\n",
+    "# Python 2 and 3 (a multi-argument print(...) shows a tuple on Python 2).\n",
+    "print('-')\n",
+    "print('Vocab size: {0} unique words'.format(vocab_size))\n",
+    "print('Story max length: {0} words'.format(story_maxlen))\n",
+    "print('Number of training stories: {0}'.format(len(train_stories)))\n",
+    "print('Number of test stories: {0}'.format(len(test_stories)))\n",
+    "print('-')\n",
+    "print('Here\\'s what a \"story\" tuple looks like (story, answer):')\n",
+    "print(train_stories[0])\n",
+    "print('-')\n",
+    "print('Vectorizing the word sequences...')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Answers dict len: 2350\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Distinct answers across the training and test sets, in sorted order.\n",
+    "answer_vocab = sorted({answer for _, answer in train_stories + test_stories})\n",
+    "# Map each answer to its 0-based position in answer_vocab. Index 0 is a real\n",
+    "# answer here, i.e. no slot is reserved for padding/masking.\n",
+    "answer_dict = {word: i for i, word in enumerate(answer_vocab)}\n",
+    "print('Answers dict len: {0}'.format(len(answer_dict)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "DescribeResult(nobs=192487, minmax=(4, 1647), mean=122.61276865450654, variance=24135.934935201523, skewness=3.780458033729767, kurtosis=19.974163240940545)\n"
+     ]
+    },
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY0AAAEPCAYAAAC+35gCAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAH5ZJREFUeJzt3XGQVWeZ5/HvL0SMQSBBGxghpJNNiMTKalBbY9RcJUNE\ntyBuVSIZx4Bhams2iVFnSg3ulEm77pCw60p2ssmWO5gAGhmCG2FmswmyzHXMSGySQMCA0E6EAJFm\nTQtqZqQAn/3jvMCh7abfhnv7dsPvU0Vx7nPf99znNN08fd7znvcoIjAzM8txVqMTMDOzwcNFw8zM\nsrlomJlZNhcNMzPL5qJhZmbZXDTMzCxbVtGQ9FlJP5a0UdK3JA2VdL6kVZK2SnpS0shS+7mS2iVt\nkTS1FJ+c9rFN0oJSfKikpanPWkkTanuYZmZWC70WDUlvAj4FTI6Ifw2cDdwE3AmsjojLgDXA3NT+\ncuBGYBIwDXhAktLuHgTmRMREYKKk61J8DtAZEZcCC4D5NTo+MzOrodzhqSHAMElnA68DdgMzgEXp\n/UXA9Wl7OrA0Ig5FxHagHWiRNBYYHhHrUrvFpT7lfS0Hppzc4ZiZWT31WjQi4mXgq8BLFMVif0Ss\nBsZEREdqswcYnbqMA3aWdrE7xcYBu0rxXSl2XJ+IOAzskzTqJI/JzMzqJGd46jyKM4ELgTdRnHF8\nHOi6/kgt1yNR703MzKy/nZ3R5lrgxYjoBJD0GPAeoEPSmIjoSENPe1P73cAFpf7jU6yneLnPy5KG\nACOOfF6ZJC+UZWZ2EiKiJr+M51zTeAl4t6Rz0gXtKcBmYCUwO7WZBaxI2yuBmWlG1EXAJUBbGsLa\nL6kl7efmLn1mpe0bKC6sdysiav7nVPZ911131SWnU/kzEHMaqHk5J+d0JuRVS72eaUREm6TlwHrg\nYPr768BwYJmkW4AdFDOmiIjNkpZRFJaDwK1xLOvbgIeBc4DHI+KJFF8ILJHUDrwCzKzN4ZmZWS3l\nDE8REa1Aa5dwJ8XQVXft5wHzuok/C1zRTfwAqeiYmdnA5TvCT1GlUml0Cr9nIOYEAzMv55THOeUb\nqHnVimo93lVPkqIe+R6593AwfS3MzHJJIvrxQriZmRmQeU3jdOczDDOzPD7TMDOzbC4aZmaWzUXD\nzMyyuWiYmVm2QXchfN68hVntPvaxD3DxxRfXORszszPLoCsa27ZN5Lzzmk/Y5uc/f453vWtHdtHw\nfRpmZnkGXdE499w3MnLkBSds88tf/hO1XandzMzA1zTMzKwPXDTMzCybi4aZmWVz0TAzs2yD7kJ4\nPXjWlJlZHp9pmJlZNhcNMzPL5qJhZmbZei0akiZKWi/pufT3fkl3SDpf0ipJWyU9KWlkqc9cSe2S\ntkiaWopPlrRR0jZJC0rxoZKWpj5rJU2o/aGamdmp6rVoRMS2iLgyIiYDbwdeBR4D7gRWR8RlwBpg\nLoCky4EbgUnANOABHVmnAx4E5kTERGCipOtSfA7QGRGXAguA+bU6QDMzq52+Dk9dC/xTROwEZgCL\nUnwRcH3ang4sjYhDEbEdaAdaJI0FhkfEutRucalPeV/LgSl9PZBTIYljdc3MzHrS16LxMeCRtD0m\nIjoAImIPMDrFxwE7S312p9g4YFcpvivFjusTEYeBfZJG9TE3MzOrs+yiIek1FGcRj6ZQ15sbanmz\ng3/tNzMbgPpyc9804NmI+EV63SFpTER0pKGnvSm+GygvQzs+xXqKl/u8LGkIMCIiOrtLoq3tfoYN\nawKgublCc3OlD4dgZnb6q1arVKvVuuy7L0XjJuDbpdcrgdnAvcAsYEUp/i1JX6MYdroEaIuISDOv\nWoB1wM3Afyv1mQX8CLiB4sJ6t1pabqepaVIf0jYzO7NUKhUqlcrR162trTXbd1bRkHQuxUXwf1cK\n3wssk3QLsINixhQRsVnSMmAzcBC4N
Y6t03Eb8DBwDvB4RDyR4guBJZLagVeAmadyUGZmVh9ZRSMi\n/hlo6hLrpCgk3bWfB8zrJv4scEU38QOkotMIXnvKzCyP7wg3M7NsLhpmZpbNRcPMzLK5aJiZWTYX\nDTMzy+aigdeeMjPL5aJhZmbZXDTMzCybi4aZmWVz0TAzs2wuGmZmlq0vq9yetrz2lJlZHp9pmJlZ\nNhcNMzPL5qJhZmbZXDTMzCybi4aZmWVz0cBrT5mZ5XLRMDOzbFlFQ9JISY9K2iLpBUnvknS+pFWS\ntkp6UtLIUvu5ktpT+6ml+GRJGyVtk7SgFB8qaWnqs1bShNoeppmZ1ULumcZ9wOMRMQl4K/AT4E5g\ndURcBqwB5gJIuhy4EZgETAMe0LGxnweBORExEZgo6boUnwN0RsSlwAJg/ikfmZmZ1VyvRUPSCOB9\nEfEQQEQcioj9wAxgUWq2CLg+bU8HlqZ224F2oEXSWGB4RKxL7RaX+pT3tRyYckpHZWZmdZFzpnER\n8AtJD0l6TtLXJZ0LjImIDoCI2AOMTu3HATtL/Xen2DhgVym+K8WO6xMRh4F9kkad5DGZmVmd5Kw9\ndTYwGbgtIp6R9DWKoamuCzbVcgGnHqcytbXdz7BhTQA0N1dobq6c8od57SkzO51Uq1Wq1Wpd9p1T\nNHYBOyPimfT6OxRFo0PSmIjoSENPe9P7u4ELSv3Hp1hP8XKflyUNAUZERGd3ybS03E5T06SMtM3M\nzkyVSoVKpXL0dWtra8323evwVBqC2ilpYgpNAV4AVgKzU2wWsCJtrwRmphlRFwGXAG1pCGu/pJZ0\nYfzmLn1mpe0bKC6sm5nZAJO7NPodwLckvQZ4EfgkMARYJukWYAfFjCkiYrOkZcBm4CBwaxwb/7kN\neBg4h2I21hMpvhBYIqkdeAWYeaoHZmZmtZdVNCLieeCd3bx1bQ/t5wHzuok/C1zRTfwAqeiYmdnA\n5TvCzcwsm4sGXnvKzCyXi4aZmWU7bZ8R/tBDK1i8+Pu9tpsw4bx+yMbM7PRw2haNvXv/hauuWtBr\nu+3b765/MmZmpwkPT5mZWTYXDTMzy3baDk/1hdeeMjPL4zMNMzPL5qJhZmbZXDTMzCybi4aZmWVz\n0TAzs2wuGnjtKTOzXC4aZmaWzUXDzMyyuWiYmVk2Fw0zM8vmomFmZtmyioak7ZKel7ReUluKnS9p\nlaStkp6UNLLUfq6kdklbJE0txSdL2ihpm6QFpfhQSUtTn7WSJtTyIHsTEV5/yswsQ+6Zxu+ASkRc\nGREtKXYnsDoiLgPWAHMBJF0O3AhMAqYBD+jYfNYHgTkRMRGYKOm6FJ8DdEbEpcACYP4pHpeZmdVB\nbtFQN21nAIvS9iLg+rQ9HVgaEYciYjvQDrRIGgsMj4h1qd3iUp/yvpYDU/pyEGZm1j9yi0YA35O0\nTtKfpNiYiOgAiIg9wOgUHwfsLPXdnWLjgF2l+K4UO65PRBwG9kka1cdjMTOzOst9nsbVEfFzSU3A\nKklbKQpJWS0vCvR4e3Zb2/0MG9YEQHNzhebmSg0/1sxs8KtWq1Sr1brsO6toRMTP09//T9J3gRag\nQ9KYiOhIQ097U/PdwAWl7uNTrKd4uc/LkoYAIyKis7tcWlpup6lpUtbBmZmdiSqVCpVK5ejr1tbW\nmu271+EpSedKen3aHgZMBTYBK4HZqdksYEXaXgnMTDOiLgIuAdrSENZ+SS3pwvjNXfrMSts3UFxY\n7zdee8rMLE/OmcYY4DFJkdp/KyJWSXoGWCbpFmAHxYwpImKzpGXAZuAgcGscm896G/AwcA7weEQ8\nkeILgSWS2oFXgJk1OTozM6upXotGRPwMeFs38U7g2h76zAPmdRN/Friim/gBUtExM7OBy3eEm5lZ\nNhcNMzPL5qJhZmbZcu/TOK153Skzszw+0zAzs2wuGmZmls1Fw8zMsrlomJlZNhcNMzPL5qKB154y\nM
8vlomFmZtlcNMzMLJuLhpmZZXPRMDOzbC4aZmaWzWtP4bWnzMxy+UzDzMyyuWiYmVk2Fw0zM8uW\nXTQknSXpOUkr0+vzJa2StFXSk5JGltrOldQuaYukqaX4ZEkbJW2TtKAUHyppaeqzVtKEWh2gmZnV\nTl/OND4NbC69vhNYHRGXAWuAuQCSLgduBCYB04AHdGyNjgeBORExEZgo6boUnwN0RsSlwAJg/kke\nj5mZ1VFW0ZA0Hvgw8Nel8AxgUdpeBFyftqcDSyPiUERsB9qBFkljgeERsS61W1zqU97XcmBK3w/l\n5HntKTOzPLlnGl8DPgeU56aOiYgOgIjYA4xO8XHAzlK73Sk2DthViu9KseP6RMRhYJ+kUfmHYWZm\n/aHX+zQkfQToiIgNkionaFrLmx16/LW/re1+hg1rAqC5uUJz84lSMjM781SrVarVal32nXNz39XA\ndEkfBl4HDJe0BNgjaUxEdKShp72p/W7gglL/8SnWU7zc52VJQ4AREdHZXTItLbfT1DQp7+jMzM5A\nlUqFSqVy9HVra2vN9t3r8FREfDEiJkTExcBMYE1EfAL4W2B2ajYLWJG2VwIz04yoi4BLgLY0hLVf\nUku6MH5zlz6z0vYNFBfWzcxsgDmVZUTuAZZJugXYQTFjiojYLGkZxUyrg8CtcWydjtuAh4FzgMcj\n4okUXwgskdQOvEJRnMzMbIDpU9GIiO8D30/bncC1PbSbB8zrJv4scEU38QOkotMIXnvKzCyP7wg3\nM7NsLhpmZpbNRcPMzLK5aJiZWTYXDTMzy+aigdeeMjPL5aJhZmbZXDTMzCybi4aZmWVz0TAzs2wu\nGmZmlu1UFiw8bXjtKTOzPD7TMDOzbC4aZmaWzUXDzMyyuWiYmVk2Fw0zM8vmooHXnjIzy+WiYWZm\n2XotGpJeK+lHktZL2iTprhQ/X9IqSVslPSlpZKnPXEntkrZImlqKT5a0UdI2SQtK8aGSlqY+ayVN\nqPWBmpnZqeu1aETEAeADEXEl8DZgmqQW4E5gdURcBqwB5gJIuhy4EZgETAMe0LGxnweBORExEZgo\n6boUnwN0RsSlwAJgfq0O0MzMaidreCoi/jltvpbiLvIAZgCLUnwRcH3ang4sjYhDEbEdaAdaJI0F\nhkfEutRucalPeV/LgSkndTRmZlZXWUVD0lmS1gN7gO+l//jHREQHQETsAUan5uOAnaXuu1NsHLCr\nFN+VYsf1iYjDwD5Jo07qiMzMrG6y1p6KiN8BV0oaATwm6S0UZxvHNathXj1OZWpru59hw5oAaG6u\n0NxcOeUP89pTZnY6qVarVKvVuuy7TwsWRsSvJFWBDwEdksZEREcaetqbmu0GLih1G59iPcXLfV6W\nNAQYERGd3eXQ0nI7TU2T+pK2mdkZpVKpUKlUjr5ubW2t2b5zZk+98cjMKEmvA/4Q2AKsBGanZrOA\nFWl7JTAzzYi6CLgEaEtDWPsltaQL4zd36TMrbd9AcWHdzMwGmJwzjT8AFkk6i6LI/E1EPC7paWCZ\npFuAHRQzpoiIzZKWAZuBg8CtcWz85zbgYeAc4PGIeCLFFwJLJLUDrwAza3J0ZmZWU70WjYjYBEzu\nJt4JXNtDn3nAvG7izwJXdBM/QCo6ZmY2cPmOcDMzy+aigdeeMjPL5aJhZmbZXDTMzCybi4aZmWVz\n0TAzs2wuGmZmlq1Py4icrrz2lJlZHp9pmJlZNhcNMzPL5qJhZmbZXDTMzCybi4aZmWVz0cBrT5mZ\n5Trjp9yuX//80e3Zs+/usd2ECefx5S9/ph8yMjMbuM74ovHqq8fu0WhuvrvHdtu39/yemdmZwsNT\nZmaWzUXDzMyyuWiYmVm2XouGpPGS1kh6QdImSXek+PmSVknaKulJSSNLfeZKape0RdLUUnyypI2S\ntklaUIoPlbQ09VkraUKtD/RE7roruOsurz9lZtabnDONQ8CfRcR
bgKuA2yS9GbgTWB0RlwFrgLkA\nki4HbgQmAdOAB3RsPuuDwJyImAhMlHRdis8BOiPiUmABML8mR2dmZjXVa9GIiD0RsSFt/wbYAowH\nZgCLUrNFwPVpezqwNCIORcR2oB1okTQWGB4R61K7xaU+5X0tB6acykGZmVl99OmahqRm4G3A08CY\niOiAorAAo1OzccDOUrfdKTYO2FWK70qx4/pExGFgn6RRfcnNzMzqL/s+DUmvpzgL+HRE/EZS14sA\ntbwo0OPt2W1t9zNsWBMAzc0VmpsrNfxYM7PBr1qtUq1W67LvrKIh6WyKgrEkIlakcIekMRHRkYae\n9qb4buCCUvfxKdZTvNznZUlDgBER0dldLi0tt9PUNCknbTOzM1KlUqFSqRx93draWrN95w5PfQPY\nHBH3lWIrgdlpexawohSfmWZEXQRcArSlIaz9klrShfGbu/SZlbZvoLiw3m9aW0Vrq9eeMjPrTa9n\nGpKuBj4ObJK0nmIY6ovAvcAySbcAOyhmTBERmyUtAzYDB4Fb49jzVG8DHgbOAR6PiCdSfCGwRFI7\n8AowszaHZ2ZmtdRr0YiIfwSG9PD2tT30mQfM6yb+LHBFN/EDpKJjZmYDl+8INzOzbC4aZmaWzUXD\nzMyynfHP0wC87pSZWSafaZiZWTYXDTMzy+aiYWZm2Vw0zMwsm4uGmZllc9HAa0+ZmeVy0TAzs2wu\nGmZmls1Fw8zMsrlomJlZNhcNMzPL5rWn8NpTZma5fKZhZmbZXDTMzCybi4aZmWXrtWhIWiipQ9LG\nUux8SaskbZX0pKSRpffmSmqXtEXS1FJ8sqSNkrZJWlCKD5W0NPVZK2lCLQ/QzMxqJ+dM4yHgui6x\nO4HVEXEZsAaYCyDpcuBGYBIwDXhA0pH1OR4E5kTERGCipCP7nAN0RsSlwAJg/ikcj5mZ1VGvs6ci\n4ilJF3YJzwCuSduLgCpFIZkOLI2IQ8B2Se1Ai6QdwPCIWJf6LAauB55M+7orxZcD95/84ZycI+tO\nnWgW1fr1zzN79t1Z+5sw4Ty+/OXP1CI1M7MB5WSn3I6OiA6AiNgjaXSKjwPWltrtTrFDwK5SfFeK\nH+mzM+3rsKR9kkZFROdJ5lYXr74aNDffndV2+/a8dmZmg02t7tOo5Y0OJ1xutq3tfoYNawKgublC\nc3Olhh9tZjb4VatVqtVqXfZ9skWjQ9KYiOiQNBbYm+K7gQtK7canWE/xcp+XJQ0BRpzoLKOl5Xaa\nmiadZNpmZqe/SqVCpVI5+rq1tbVm+86dciuOPwNYCcxO27OAFaX4zDQj6iLgEqAtIvYA+yW1pAvj\nN3fpMytt30BxYd3MzAagXs80JD0CVIA3SHqJ4qL1PcCjkm4BdlDMmCIiNktaBmwGDgK3RsSRoavb\ngIeBc4DHI+KJFF8ILEkXzV8BZtbm0MzMrNZyZk/9UQ9vXdtD+3nAvG7izwJXdBM/QCo6jeK1p8zM\n8viOcDMzy+aiYWZm2Vw0zMwsm5+nUQe5d4/7znEzG2xcNOog9+5x3zluZoONh6co1p46sv6UmZn1\nzEXDzMyyuWiYmVk2Fw0zM8vmomFmZtk8e6qBPDXXzAYbFw0at/aUp+aa2WDj4SkzM8vmM41BwMNY\nZjZQuGgMAh7GMrOBwsNTZmaWzWcapxEPY5lZvblowNF1pwb7E/xyh7Eee+yjvPTSvl7bubiYWVcD\npmhI+hCwgGLIbGFE3NvglE5bvkZiZidrQFzTkHQWcD9wHfAW4CZJb25sVnm2b682OoXfMxBzAqhW\nq41O4fc4pzzOKd9AzatWBsqZRgvQHhE7ACQtBWYAP2loVhm2b6/S3FxpdBrHqVVOuddIIG8oq1qt\nUqmcel615JzyOKd8AzWvWhkoRWMcsLP0ehdFIbEGyh3GgrzrJBs2VFmz5iYuvviyXvf34otbs9rV\n47rLl760wNd8zHowUIpGtv3
7f8Bvf7v+hG1++9tXkPxQpf6UU2C2b7+bXbs28MEPnrgdwFNPXZ/V\nLveifk9FaMOG6u9du1m/fjMf/eiyun92T+26y+lk9gf5ha23Qnkkp74eS63y6wsX/fpSRONnDEl6\nN3B3RHwovb4TiK4XwyU1Plkzs0EoImrym/RAKRpDgK3AFODnQBtwU0RsaWhiZmZ2nAExPBURhyXd\nDqzi2JRbFwwzswFmQJxpmJnZ4DAg7tPIIelDkn4iaZukL/TTZ46XtEbSC5I2Sbojxc+XtErSVklP\nShpZ6jNXUrukLZKm1jG3syQ9J2nlAMpppKRH0+e8IOldjc5L0mcl/VjSRknfkjS0v3OStFBSh6SN\npVifc5A0OR3HNkkL6pTX/PS5GyR9R9KI/syru5xK7/25pN9JGjUQcpL0qfS5myTd0+icJL1V0lpJ\n6yW1SXpHXXKKiAH/h6K4/RS4EHgNsAF4cz987ljgbWn79RTXXd4M3At8PsW/ANyTti8H1lMM+zWn\nnFWn3D4LfBNYmV4PhJweBj6Zts8GRjYyL+BNwIvA0PT6b4BZ/Z0T8F7gbcDGUqzPOQA/At6Zth8H\nrqtDXtcCZ6Xte4B5/ZlXdzml+HjgCeBnwKgUm9SonIAKxXD62en1GwdATk8CU9P2NODv6/FvN1jO\nNI7e/BcRB4EjN//VVUTsiYgNafs3wBaKb94ZwKLUbBFwfdqeDiyNiEMRsR1opw73m0gaD3wY+OtS\nuNE5jQDeFxEPAaTP29/ovIAhwDBJZwOvA3b3d04R8RTwyy7hPuUgaSwwPCLWpXaLS31qlldErI6I\n36WXT1N8v/dbXj18rQC+BnyuS2xGA3P69xSF/lBq84sBkNPvKH5RAziP4nsdavxvN1iKRnc3/43r\nzwQkNVNU9qeBMRHRAUVhAUanZl3z3E198jzyA1S+INXonC4CfiHpoTRs9nVJ5zYyr4h4Gfgq8FLa\n//6IWN3InEpG9zGHcRTf90f0x8/ALRS/fTY0L0nTgZ0RsanLW438Wk0E3i/paUl/L+ntAyCnzwL/\nRdJLwHxgbj1yGixFo6EkvR5YDnw6nXF0nT3Qb7MJJH0E6EhnQCead93fMxzOBiYD/z0iJgOvAnd2\nk0d/fq3Oo/jN70KKoaphkj7eyJxOYCDkcJSk/wAcjIhvNziP1wFfBO5qZB7dOBs4PyLeDXweeLTB\n+UBx9vPpiJhAUUC+UY8PGSxFYzcwofR6PMdOveoqDWssB5ZExIoU7pA0Jr0/FthbyvOCOud5NTBd\n0ovAt4EPSloC7GlgTlD8lrIzIp5Jr79DUUQa+bW6FngxIjoj4jDwGPCeBud0RF9z6LfcJM2mGP78\no1K4UXn9K4px+Ocl/Szt/zlJo+n5/4X++FrtBP4XQBreOSzpDQ3OaVZEfDfltBx4Z4rX9N9usBSN\ndcAlki6UNBSYCazsp8/+BrA5Iu4rxVYCs9P2LGBFKT5TxQydi4BLKG5UrJmI+GJETIiIiym+Dmsi\n4hPA3zYqp5RXB7BT0sQUmgK8QAO/VhTDUu+WdI4kpZw2NygncfyZYZ9ySENY+yW1pGO5udSnZnmp\neETB54DpEXGgS779ldfRnCLixxExNiIujoiLKH45uTIi9qacPtbfOSXfBT4IkL7nh0bEKw3Oabek\na1JOUyiuXUCt/+1O9up9f/8BPkQxe6kduLOfPvNq4DDFbK31wHMpj1HA6pTPKuC8Up+5FLMTtpBm\nMtQxv2s4Nnuq4TkBb6Uo8Bsofgsb2ei8KIY1tgAbKS44v6a/cwIeAV4GDlAUsk8C5/c1B+DtwKb0\nM3BfnfJqB3ak7/XngAf6M6/ucury/ouk2VONzIlieGpJ+oxngGsGQE7vSbmsB9ZSFNea5+Sb+8zM\nLNtgGZ4yM7MBwEXDzMyyuWiYmVk2Fw0zM8vmomFmZtlcNMzMLJuLhg1qkv5S0jWSZqgflsyX9
DOV\nluau02dcI+mqen6G2cly0bDB7l0UyztfA/xDLXes4jHEXfXHjU0Vihu1svWQq1nNuWjYoKTiYUHP\nA+8Afgj8CfCgpL+Q1CTpmdTurSoe3DM+vf5pWlbkQkn/V8XDhr5Xev8hSQ9Kehq4V9IoFQ9J2iTp\nf5KWbZB0rqS/U/HAm42SbuiS3xAVD8J5f3o9T9J/7OY47lDxwKoNkh6RdCHwp8Bn0mrBV2fkuhaY\nr+JBOm9I70nFQ3feUIcvv53BBsQzws36KiI+L2kZ8Angz4BqRLzvyPuSXqtideL3Uixt8j5J/0ix\nQvBvJf0V8FBEfFPSJ4G/Aj6auo+LYvVSJN0H/CAiviLpwxTLhUOxnMzuiPg3qd3wLvkdTgv/Pari\niY9TKc6KuvoC0BwRByWNiIhfSfofwK8j4r+mfa/sJderUrt9wB8D91Es1rghivWQzGrGZxo2mE2m\nWFNqEvCTLu/9kKJgvB/4S4rhq/cBP0jvX0WxSjAUawhdXepbXub6/RRPSCQiHufYg282AX+YziDe\nGxG/7ppcRGxOff+OYg2lQ90cw/PAIyqWbD/cw3Hm5voQRRGForg91MP+zE6ai4YNOmnIaT3wn4A/\nB/43MDUN57w2NfsBRZGYEMWS9m+l+M/2SNE40bWJV0vbXdsdWX21naJobQK+IukvetjXFRSFZkwP\n738EuD/ta52k7n4ms3KNiF0US65/gGJZ7P9zgn5mJ8VFwwadiHg+Iq4EtkbE5cAaipU7J8ex5bx/\nQDFUc2R56E6KZ0Q8lV7/ELgpbf8xx4pJV/8AfBxA0jSKx2gi6Q+Af4mIR4D/TPGf/nEk/VuK1Wzf\nD9yv4pG45fdFUdS+T/HAqhEUz6L/ddo+IjdXgIUUZzfLwquRWh24aNigJOmNHBsquiwitpbfj4gd\nafP76e+ngH1RPLcc4A7gk5I2UBSFTx/p2uWjvkzxWM9NFM9PfinFrwDa0hnPl4CvdMnvDRTDYnMi\n4qcU1yHKz2SB4vnl30wX9J+lWJr6VxTPRvnokQvhwKcyc4Xi2QnDgIe7ec/slHlpdLPTiKR3AF+N\niGsanYudnjx7yuw0kW5u/FOOf0yrWU35TMPMzLL5moaZmWVz0TAzs2wuGmZmls1Fw8zMsrlomJlZ\nNhcNMzPL9v8BAdbztcCvKUMAAAAASUVORK5CYII=\n",
+      "text/plain": [
+       "<matplotlib.figure.Figure at 0x7fafc3877d10>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Number of tokens in each story, over the training and test sets together.\n",
+    "# Built as a list because it is consumed three times below (a Python 3\n",
+    "# map() iterator would be exhausted after the first use).\n",
+    "lens = [len(story) for story, _ in train_stories + test_stories]\n",
+    "print(stats.describe(lens))\n",
+    "\n",
+    "fig, ax = plt.subplots()\n",
+    "ax.set_xlabel('#words x story')\n",
+    "ax.hist(lens, bins=30, alpha=0.5)\n",
+    "# Dashed vertical line marks the mean length.\n",
+    "ax.axvline(np.mean(lens), color='black', linestyle='dashed', linewidth=2)\n",
+    "\n",
+    "# savefig does not create missing directories.\n",
+    "if not os.path.isdir('plots'):\n",
+    "    os.makedirs('plots')\n",
+    "fig.savefig('plots/word_by_story.png')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "DescribeResult(nobs=192487, minmax=(2, 118), mean=11.537999968829064, variance=153.05061042999489, skewness=2.9519698043011893, kurtosis=12.82327875307992)\n"
+     ]
+    },
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYoAAAEPCAYAAABcA4N7AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAHFZJREFUeJzt3XGMVed95vHvYzAhTgDjTcANmBgLQ8ByY9MtSerN5jZ4\nwaQV0D/sxY1iqIn6B2yTNKtoISuVmUhbalZd42gXS1GojZETFpN6oStkTxG5WbkigWAcbENgti4Y\nhjCOmUBtV4sA//aP82LOjGfOnJm545m59/lII879zfueOa/Hmuee95zzXkUEZmZmPbluqA/AzMyG\nNweFmZkVclCYmVkhB4WZmRVyUJiZWSEHhZmZFSoVFJL+XNIrkg5LelrSGEkTJbVIOibpeUkTcu3X\nSmqVdFTSglx9btrHcUkbc/UxkralPvskTavtMM3MrL96DQpJnwD+DJgbEb8NjAYeBNYAeyJiFrAX\nWJvazwEeAGYDi4BNkpR29ziwMiJmAjMlLUz1lUBHRNwObAQ21Gh8ZmY2QGWnnkYBH5E0Gvgw0AYs\nAbak728BlqbtxcC2iLgcESeAVmCepJuBcRFxILV7Ktcnv68dwPz+DcfMzGqt16CIiDPAXwOvkwXE\nhYjYA0yOiPbU5iwwKXWZApzK7aIt1aYAp3P106nWqU9EXAHOS7qpn2MyM7MaKjP1dCPZO/5PAp8g\nO7P4MtB17Y9argWi3puYmdkHYXSJNvcCr0VEB4CkZ4HfA9olTY6I9jSt9EZq3wbckus/NdV6quf7\nnJE0Chh/9eflSfLCVGZm/RAR/X4DXuYaxevAZyWNTRel5wNHgF3AitRmObAzbe8ClqU7maYDM4D9\naXrqgqR5aT8PdemzPG3fT3ZxvFsR0enrrbfeoq2trdTX+fPn39e/7FdPP7+WX+vWrRvU/Q/1Vz2P\nr57H5vGN/K+B6vWMIiL2S9oBHAIupX+/B4wDtkt6GDhJdqcTEXFE0nayMLkErIprR7oaeBIYC+yO\niOdSfTOwVVIrcA5YVnYA3/3u0xw/fpnRo68vbPfuu+/y8Y+/wyOP/MeyuzYzM8pNPRERzUBzl3IH\n2bRUd+3XA+u7qR8E7uymfpEUNH319tuXmDRpGTfc8LHCdpcv/z/OnXu0Pz/CzKyh+cnsYaRSqQz1\nIQyqeh5fPY8NPL5G56AYRur9f9Z6Hl89jw08vkbnoDAzs0KlrlEYNblzwMxsJPIZhZmZFXJQmJlZ\nIQeFmZkVclCYmVkhB4WZmRVyUJQkiWufv2Rm1jgcFGZmVshBYWZmhRwUZmZWyEFhZmaFHBRmZlbI\naz2V5LWezKxR+YzCzMwKOSjMzKyQg8LMzAr1GhSSZko6JOnF9O8FSV+TNFFSi6Rjkp6XNCHXZ62k\nVklHJS3I1edKOizpuKSNufoYSdtSn32SptV+qGZm1h+9BkVEHI+IuyNiLvA7wDvAs8AaYE9EzAL2\nAmsBJM0BHgBmA4uATbq29sXjwMqImAnMlLQw1VcCHRFxO7AR2FCrAZqZ2cD0derpXuAfI+IUsATY\nkupbgKVpezGwLSIuR8QJoBWYJ+lmYFxEHEjtnsr1ye9rBzC/rwMZbF7rycwaVV+D4t8DP0jbkyOi\nHSAizgKTUn0KcCrXpy3VpgCnc/XTqdapT0RcAc5LuqmPx2ZmZoOgdFBIup7sbOGZVOr6YEEtHzTw\nW3czs2GiLw/cLQIORsSb6XW7pMkR0Z6mld5I9Tbglly/qanWUz3f54ykUcD4iOjo7iCampre265U\nKn04fDOzxlCtVqlWqzXbX1+C4kHgh7nXu4AVwCPAcmBnrv60pEfJppRmAPsjItIdU/OAA8BDwHdz\nfZYDPwPuJ7s43q18UAC0tLzShyGYmdW/SqXS6Y10c3PzgPZXKigk3UB2IftPc+VHgO2SHgZOkt3p\nREQckbQdOAJcAlbFtfUvVgNPAmOB3
RHxXKpvBrZKagXOAcsGMigzM6udUkEREf8CfLxLrYMsPLpr\nvx5Y3039IHBnN/WLpKAZrrzWk5k1Kj+ZbWZmhRwUZmZWyEFhZmaFHBRmZlbIQWFmZoUcFCV5rScz\na1QOCjMzK+SgMDOzQg4KMzMr5KAwM7NCDgozMyvUl9VjG5rXejKzRuUzCjMzK+SgMDOzQg4KMzMr\n5KAwM7NCDgozMyvkoCjJaz2ZWaNyUJiZWaFSQSFpgqRnJB2V9Kqkz0iaKKlF0jFJz0uakGu/VlJr\nar8gV58r6bCk45I25upjJG1LffZJmlbbYZqZWX+VPaN4DNgdEbOBTwO/BNYAeyJiFrAXWAsgaQ7w\nADAbWARs0rU5m8eBlRExE5gpaWGqrwQ6IuJ2YCOwYcAjMzOzmug1KCSNBz4fEU8ARMTliLgALAG2\npGZbgKVpezGwLbU7AbQC8yTdDIyLiAOp3VO5Pvl97QDmD2hUZmZWM2XOKKYDb0p6QtKLkr4n6QZg\nckS0A0TEWWBSaj8FOJXr35ZqU4DTufrpVOvUJyKuAOcl3dTPMZmZWQ2VWetpNDAXWB0RP5f0KNm0\nU9fFj2q5GFKPtxc1NTW9t12pVGr4I4t5rSczGymq1SrVarVm+ysTFKeBUxHx8/T6R2RB0S5pckS0\np2mlN9L324Bbcv2nplpP9XyfM5JGAeMjoqO7g8kHBUBLyyslhmBm1jgqlUqnN9LNzc0D2l+vU09p\neumUpJmpNB94FdgFrEi15cDOtL0LWJbuZJoOzAD2p+mpC5LmpYvbD3Xpszxt3092cdzMzIaBssuM\nfw14WtL1wGvAnwCjgO2SHgZOkt3pREQckbQdOAJcAlbFtXmb1cCTwFiyu6ieS/XNwFZJrcA5YNlA\nB2ZmZrVRKigi4hfA73bzrXt7aL8eWN9N/SBwZzf1i6SgMTOz4cVPZpuZWSEHRUle68nMGpWDwszM\nCjkozMyskIPCzMwKOSjMzKyQg8LMzAqVfeCu4XmtJzNrVD6jMDOzQg4KMzMr5KAwM7NCDgozMyvk\noDAzs0IOipK81pOZNSoHhZmZFXJQmJlZIQeFmZkVclCYmVkhB4WZmRUqFRSSTkj6haRDkvan2kRJ\nLZKOSXpe0oRc+7WSWiUdlbQgV58r6bCk45I25upjJG1LffZJmlbLQdZCRHi9JzNrSGXPKN4FKhFx\nd0TMS7U1wJ6ImAXsBdYCSJoDPADMBhYBm3TtvtLHgZURMROYKWlhqq8EOiLidmAjsGGA4zIzsxop\nGxTqpu0SYEva3gIsTduLgW0RcTkiTgCtwDxJNwPjIuJAavdUrk9+XzuA+X0ZhJmZDZ6yQRHA30s6\nIOmrqTY5ItoBIuIsMCnVpwCncn3bUm0KcDpXP51qnfpExBXgvKSb+jgWMzMbBGU/j+KeiPiVpI8D\nLZKOkYVHXi0n8Ht8BLqpqem97UqlUsMfaWZWH6rVKtVqtWb7KxUUEfGr9O+vJf0vYB7QLmlyRLSn\naaU3UvM24JZc96mp1lM93+eMpFHA+Ijo6O5Y8kEB0NLySpkhmJk1jEql0umNdHNz84D21+vUk6Qb\nJH00bX8EWAC8DOwCVqRmy4GdaXsXsCzdyTQdmAHsT9NTFyTNSxe3H+rSZ3navp/s4viw4rWezKxR\nlTmjmAw8KylS+6cjokXSz4Htkh4GTpLd6UREHJG0HTgCXAJWxbX7SlcDTwJjgd0R8Vyqbwa2SmoF\nzgHLajI6MzMbsF6DIiL+Cbirm3oHcG8PfdYD67upHwTu7KZ+kRQ0ZmY2vPjJbDMzK+SgMDOzQg4K\nMzMrVPY5iobndZ7MrFH5jMLMzAo5KMzMrJCDwszMCjkozMyskIPCzMwKOShK8lpPZtaoHBRmZlbI\nQWFmZoUcFGZmVshBYWZmhRwUZmZWyGs9leS1nsysUfmMwszMCjkozMyskIPCzMwKlQ4KSddJelHS\nr
vR6oqQWScckPS9pQq7tWkmtko5KWpCrz5V0WNJxSRtz9TGStqU++yRNq9UAzcxsYPpyRvF14Eju\n9RpgT0TMAvYCawEkzQEeAGYDi4BNurb2xePAyoiYCcyUtDDVVwIdEXE7sBHY0M/xmJlZjZUKCklT\ngS8B38+VlwBb0vYWYGnaXgxsi4jLEXECaAXmSboZGBcRB1K7p3J98vvaAczv+1AGl9d6MrNGVfaM\n4lHgW0D+HtHJEdEOEBFngUmpPgU4lWvXlmpTgNO5+ulU69QnIq4A5yXdVH4YZmY2WHp9jkLSHwDt\nEfGSpEpB01o+aNDjW/empqb3tiuVSg1/pJlZfahWq1Sr1Zrtr8wDd/cAiyV9CfgwME7SVuCspMkR\n0Z6mld5I7duAW3L9p6ZaT/V8nzOSRgHjI6Kju4PJBwVAS8srJYZgZtY4KpVKpzfSzc3NA9pfr1NP\nEfHtiJgWEbcBy4C9EfEV4O+AFanZcmBn2t4FLEt3Mk0HZgD70/TUBUnz0sXth7r0WZ627ye7OG5m\nZsPAQJbw+Ctgu6SHgZNkdzoREUckbSe7Q+oSsCqurX+xGngSGAvsjojnUn0zsFVSK3COLJDMzGwY\n6FNQRMRPgJ+k7Q7g3h7arQfWd1M/CNzZTf0iKWiGK6/1ZGaNyk9mm5lZoYZaPXb//oOsWNFUqu20\naTfyne98Y3APyMxsBGiooHj77Xe59damUm1PnCjXzsys3nnqyczMCjkozMyskIOiJK/1ZGaNykFh\nZmaFHBRmZlbIQWFmZoUcFGZmVshBYWZmhRrqgbuB8FpPZtaofEZhZmaFHBRmZlbIQWFmZoUcFGZm\nVshBYWZmhRwUJXmtJzNrVA4KMzMr1GtQSPqQpJ9JOiTpZUnrUn2ipBZJxyQ9L2lCrs9aSa2Sjkpa\nkKvPlXRY0nFJG3P1MZK2pT77JE2r9UDNzKx/eg2KiLgI/H5E3A3cBSySNA9YA+yJiFnAXmAtgKQ5\nwAPAbGARsEnX5mweB1ZGxExgpqSFqb4S6IiI24GNwIZaDdDMzAam1NRTRPxL2vwQ2dPcASwBtqT6\nFmBp2l4MbIuIyxFxAmgF5km6GRgXEQdSu6dyffL72gHM79dozMys5koFhaTrJB0CzgJ/n/7YT46I\ndoCIOAtMSs2nAKdy3dtSbQpwOlc/nWqd+kTEFeC8pJv6NSIzM6upUms9RcS7wN2SxgPPSrqD7Kyi\nU7MaHlePtxc1NTW9t12pVGr4I4t5rSczGymq1SrVarVm++vTooAR8c+SqsB9QLukyRHRnqaV3kjN\n2oBbct2mplpP9XyfM5JGAeMjoqO7Y8gHBUBLyyt9GYKZWd2rVCqd3kg3NzcPaH9l7nr62NU7miR9\nGPh3wFFgF7AiNVsO7Ezbu4Bl6U6m6cAMYH+anrogaV66uP1Qlz7L0/b9ZBfHzcxsGChzRvFbwBZJ\n15EFy/+MiN2Sfgpsl/QwcJLsTici4oik7cAR4BKwKq7N26wGngTGArsj4rlU3wxsldQKnAOW1WR0\nZmY2YL0GRUS8DMztpt4B3NtDn/XA+m7qB4E7u6lfJAWNmZkNL34y28zMCjkoSvJaT2bWqBwUZmZW\nyEFhZmaFHBRmZlbIQWFmZoUcFGZmVqhPS3g0Mq/1ZGaNymcUZmZWyEFhZmaFHBRmZlbIQWFmZoUc\nFGZmVshBUZLXejKzRuWgMDOzQg4KMzMr5KAwM7NCfjK7B4cO/YIVK5reV++uNm3ajXznO98Y/IMy\nMxsCDooevPNOcOutTblKM0CXWubEiffXzMzqRa9TT5KmStor6VVJL0v6WqpPlNQi6Zik5yVNyPVZ\nK6lV0lFJC3L1uZIOSzouaWOuPkbSttRnn6RptR7oQK1bF6xb5/WezKzxlLlGcRn4ZkTcAXwOWC3p\nU8AaYE9EzAL2AmsBJM0BHgBmA4uATbp2X+njwMqImAnMlLQw1Vc
CHRFxO7AR2FCT0ZmZ2YD1GhQR\ncTYiXkrbbwNHganAEmBLarYFWJq2FwPbIuJyRJwAWoF5km4GxkXEgdTuqVyf/L52APMHMigzM6ud\nPt31JOlW4C7gp8DkiGiHLEyASanZFOBUrltbqk0BTufqp1OtU5+IuAKcl3RTX47NzMwGR+mL2ZI+\nSvZu/+sR8bakrhP2tZzA7/ER6Kampve2K5VKDX+kmVl9qFarVKvVmu2vVFBIGk0WElsjYmcqt0ua\nHBHtaVrpjVRvA27JdZ+aaj3V833OSBoFjI+Iju6OJR8UAC0tr5QZgplZw6hUKp3eSDc3Nw9of2Wn\nnv4GOBIRj+Vqu4AVaXs5sDNXX5buZJoOzAD2p+mpC5LmpYvbD3Xpszxt3092cXxYaW4Wzc1e68nM\nGk+vZxSS7gG+DLws6RDZFNO3gUeA7ZIeBk6S3elERByRtB04AlwCVsW1zxFdDTwJjAV2R8Rzqb4Z\n2CqpFTgHLKvN8MzMbKB6DYqI+AdgVA/fvreHPuuB9d3UDwJ3dlO/SAoaMzMbXrzWk5mZFXJQmJlZ\nIQeFmZkV8qKAJXmdJzNrVD6jMDOzQg4KMzMr5KAwM7NCDgozMyvkoDAzs0K+66mkq+s8dXf3U0+f\nr90df762mY00DooaeP/na/fMn69tZiONp57MzKyQg8LMzAo5KMzMrJCDwszMCvlidkle68nMGpXP\nKMzMrJCDwszMCjkozMysUK9BIWmzpHZJh3O1iZJaJB2T9LykCbnvrZXUKumopAW5+lxJhyUdl7Qx\nVx8jaVvqs0/StFoO0MzMBqbMGcUTwMIutTXAnoiYBewF1gJImgM8AMwGFgGbJCn1eRxYGREzgZmS\nru5zJdAREbcDG4ENAxiPmZnVWK9BEREvAL/pUl4CbEnbW4ClaXsxsC0iLkfECaAVmCfpZmBcRBxI\n7Z7K9cnvawcwvx/jGHTNzXpvvSczs0bS39tjJ0VEO0BEnJU0KdWnAPty7dpS7TJwOlc/nepX+5xK\n+7oi6bykmyKio5/HNqx5AUEzG2lq9RxFLR8yKHzb3tTU9N52pVKp4Y/9YHgBQTMbbNVqlWq1WrP9\n9Tco2iVNjoj2NK30Rqq3Abfk2k1NtZ7q+T5nJI0CxhedTeSDAqCl5ZV+DsHMrD5VKpVOb6Sbm5sH\ntL+yt8eKzu/0dwEr0vZyYGeuvizdyTQdmAHsj4izwAVJ89LF7Ye69Fmetu8nuzhuZmbDRK9nFJJ+\nAFSAfyXpdWAd8FfAM5IeBk6S3elERByRtB04AlwCVkXE1Wmp1cCTwFhgd0Q8l+qbga2SWoFzwLLa\nDM3MzGqh16CIiD/u4Vv39tB+PbC+m/pB4M5u6hdJQTOcea0nM2tUfjLbzMwKefXYYazsrbS+jdbM\nBpODYhgreyutb6M1s8HkqSczMyvkoDAzs0IOipK81pOZNSpfo6gDXj/KzAaTg6IOeP0oMxtMnnoy\nM7NCDgozMyvkoDAzs0K+RlGS13oys0blMwozMyvkoDAzs0KeemowfXnm4rXXjnHbbbNKtfXzGWb1\ny0HRYPryzMULLyzli18s19bPZ5jVL089mZlZIZ9RlHR1nSff/dQ9LyNiVr+GTVBIug/YSHaWszki\nHhniQ7I+8DIiZvVrWASFpOuA/w7MB84AByTtjIhfDu2RfbBOnKhy662VoT6MQXN1fPX4yX3VapVK\npTLUhzFoPL7GNiyCApgHtEbESQBJ24AlgIOijlwdX9mzj2ef/SNef/18qX0PdajU+x8aj6+xDZeg\nmAKcyr0+TRYe1sD6Mp01kkLFbKQZLkHRbx/60HW0tf1vRo8eU9gu4l1GjfIHD9WrwQqVss+S/PjH\nOzhxotQu+/R8ylC3vdrupZeqvV5bGqznbv7iLzbW/PfVtV3R+PzGAhQx9HfxSPos0BQR96XXa4Do\nekFb0tAfrJnZCBQR/X6nPFy
CYhRwjOxi9q+A/cCDEXF0SA/MzMyGx9RTRFyR9B+AFq7dHuuQMDMb\nBobFGYWZmQ1fI2YJD0n3SfqlpOOS/tNQH89ASJoqaa+kVyW9LOlrqT5RUoukY5KelzRhqI91ICRd\nJ+lFSbvS67oZn6QJkp6RdDT9Hj9TZ+P7c0mvSDos6WlJY0by+CRtltQu6XCu1uN4JK2V1Jp+vwuG\n5qjL6WFsG9KxvyTpR5LG577X57GNiKDIPZC3ELgDeFDSp4b2qAbkMvDNiLgD+BywOo1nDbAnImYB\ne4G1Q3iMtfB14EjudT2N7zFgd0TMBj5N9sxPXYxP0ieAPwPmRsRvk01RP8jIHt8TZH8/8rodj6Q5\nwAPAbGARsEnScL5lsruxtQB3RMRdQCsDHNuICApyD+RFxCXg6gN5I1JEnI2Il9L228BRYCrZmLak\nZluApUNzhAMnaSrwJeD7uXJdjC+9O/t8RDwBEBGXI+ICdTK+ZBTwEUmjgQ8DbYzg8UXEC8BvupR7\nGs9iYFv6vZ4g+0M7bJ/r6m5sEbEnIt5NL39K9vcF+jm2kRIU3T2QN2WIjqWmJN0K3EX2y5wcEe2Q\nhQkwaeiObMAeBb4F5C+C1cv4pgNvSnoiTa19T9IN1Mn4IuIM8NfA62QBcSEi9lAn48uZ1MN4uv69\naWNk/715GNidtvs1tpESFHVJ0keBHcDX05lF1zsLRuSdBpL+AGhPZ01Fp7UjcnxkUzFzgf8REXOB\nd8imMerl93cj2bvtTwKfIDuz+DJ1Mr4C9TYeJP1n4FJE/HAg+xkpQdEGTMu9nppqI1Y6pd8BbI2I\nnancLmly+v7NwBtDdXwDdA+wWNJrwA+BL0raCpytk/GdBk5FxM/T6x+RBUe9/P7uBV6LiI6IuAI8\nC/we9TO+q3oaTxtwS67diPx7I2kF2fTvH+fK/RrbSAmKA8AMSZ+UNAZYBuwa4mMaqL8BjkTEY7na\nLmBF2l4O7OzaaSSIiG9HxLSIuI3sd7U3Ir4C/B31Mb524JSkmak0H3iVOvn9kU05fVbS2HShcz7Z\nTQkjfXyi8xluT+PZBSxLd3pNB2aQPQQ8nHUaW/rYhm8BiyPiYq5d/8YWESPiC7iP7OntVmDNUB/P\nAMdyD3AFeAk4BLyYxncTsCeNswW4caiPtQZj/QKwK23XzfjI7nQ6kH6HfwtMqLPxrSO7yeIw2YXe\n60fy+IAfkH2EwUWyIPwTYGJP4yG7S+j/pv8GC4b6+PsxtlbgZPrb8iKwaSBj8wN3ZmZWaKRMPZmZ\n2RBxUJiZWSEHhZmZFXJQmJlZIQeFmZkVclCYmVkhB4VZNyT9paQvSFpydVl7SbMkHZJ0MD2s1Jf9\nfUHS5wbnaM0Gl4PCrHufAX5G9sDg/0m1pcAzEfE7EfFPfdxfhWwZDLMRxw/cmeVI2kC2tv+twD+S\nLXHwGtl6TqvInqg/FhHzJT1LtlbOWOCxiPh+2sd9wH8heyP2JvBVstWBLwO/Jvush98ie/r5Mtnq\nrJUPZoRmfeegMOtC0r8GvgJ8E6hGxOdTfR3wVkT8t/T6xog4L2ks2XIe/5bscxxeBP5NRLyea9O1\n72FgYUT8StL4iPjnD3ygZiV56sns/eaSrXE0m+yT63ryDUkvce2DYW4HPgv8JCJeB4iI8z30fQHY\nIumrZMuWmw1b/h/ULJH0aeBJsj/6vwY+kuovkn1kbb7tF4AvAp+JiIuSfkw2BQXFn8EBQESskvS7\nwB8CByXNjYiun8BmNiz4jMIsiYhfRMTdZNcg5pB9jvKCiJgbnZdqhmy12N+kkPgU2ZkEZGcXn5f0\nSQBJE1P9LSD/Afe3RcSBiFhH9jkI+c8IMBtWHBRmOZI+xrXPH54VEcd6aPoccL2kV4G/BPYBRMSb\nwJ8Cz0o6RPb57pB9FscfpY9OvQf4r5IOp2sV/xARhwdpSGYD5ovZZmZWyGcUZmZWyEFhZmaFH
BRm\nZlbIQWFmZoUcFGZmVshBYWZmhRwUZmZWyEFhZmaF/j+bsAEB1lOwIAAAAABJRU5ErkJggg==\n",
+      "text/plain": [
+       "<matplotlib.figure.Figure at 0x7fafc8392bd0>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Number of facts attached to each entry, over the training and test sets\n",
+    "# together. Built as a list because it is consumed three times below (a\n",
+    "# Python 3 map() iterator would be exhausted after the first use).\n",
+    "lens = [len(fact_list) for fact_list, _ in train_facts + test_facts]\n",
+    "print(stats.describe(lens))\n",
+    "\n",
+    "fig, ax = plt.subplots()\n",
+    "ax.set_xlabel('#facts')\n",
+    "ax.hist(lens, bins=30, alpha=0.5)\n",
+    "# Dashed vertical line marks the mean number of facts.\n",
+    "ax.axvline(np.mean(lens), color='black', linestyle='dashed', linewidth=2)\n",
+    "\n",
+    "# savefig does not create missing directories.\n",
+    "if not os.path.isdir('plots'):\n",
+    "    os.makedirs('plots')\n",
+    "fig.savefig('plots/facts_by_disease.png')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "8\n",
+      "DescribeResult(nobs=2220915, minmax=(2, 111), mean=10.626865053367643, variance=63.421087757828843, skewness=1.34814656662994, kurtosis=3.0186643322827527)\n"
+     ]
+    },
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZEAAAEPCAYAAACDTflkAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3X+U1fV95/HnSyYUkwhCjZiAqDkKwaxNJBGS2iR3VX6l\nPej2rC5pUgYl7WllY2zO5hSyPTKjPaXa0xPcdPVsWirIJmGRrAttOTKheNNMYiogihWC0zUoDGFo\nHJ1NTEIB3/vH9wN8nd5h7nzvhZm5vB7nzOFz39/P5zPfz4ze93w+3+/3cxURmJmZFXHeYJ+AmZkN\nX04iZmZWmJOImZkV5iRiZmaFOYmYmVlhTiJmZlZYVUlE0h9I+idJuyR9TdJISWMltUnaK2mzpDG5\n+ksldUjaI2lWLj4t9fGipBW5+EhJa1ObpyRNyh1rTvX3SlpQr4GbmVnt+k0ikt4DfA6YFhG/AjQB\nnwKWAFsiYgqwFVia6l8N3AZMBeYCD0lS6u5hYFFETAYmS5qd4ouA7oi4ClgBPJD6GgvcA1wHzACW\n5ZOVmZkNrmqXs0YA75DUBJwPdAI3A6vT8dXALak8D1gbEcciYh/QAUyXdAlwQURsS/UezbXJ97Ue\nuCGVZwNtEdETEa8DbcCcgQ3RzMzOlH6TSEQcBP4ceIUsefRExBZgfER0pTqHgItTkwnA/lwXnSk2\nATiQix9Isbe0iYjjQI+kcafpy8zMhoBqlrMuJJspXAa8h2xG8mmg934p9dw/Rf1XMTOzwdZURZ2b\ngJciohtA0uPArwJdksZHRFdaqjqc6ncCl+baT0yxvuL5NgcljQBGR0S3pE6g1KvNk71PUJI3ADMz\nKyAiavqjvZprIq8AH5E0Kl0gvxHYDWwEFqY6zcCGVN4IzE93XF0BXAk8nZa8eiRNT/0s6NWmOZVv\nJbtQD7AZmClpTLrIPjPF/o2IOKtfZ/P7Llu27KyP72x+eXzD+6uRx9fIY4uoz9/e/c5EIuJpSeuB\nncDR9O9XgQuAdZLuAF4muyOLiNgtaR1ZojkK3BmnznYxsAoYBWyKiCdSfCWwRlIH8CowP/X1mqT7\ngO1ky2WtkV1gNzOzIaCa5SwiohVo7RXuJlvqqlR/ObC8QnwHcE2F+BFSEqpwbBVZ4jEzsyHGT6wP\nA6VSabBP4Yzy+Ia3Rh5fI4+tXlSvdbHBJCnO9jhOPD/ZCD8/Mzs3SSLOwoV1MzOziqq6JmL/lmcg\nZmaeiZiZWQ0adiZy/PhxDh48WFXdiy66iPPPP/8Mn5GZWeNp2CSybds2vvKV73L++Reett6RIz9j\n9uyJfOYz/+EsnZmZWeNo2CRy7NgxRo78ABMnVnyU5aTDh1/g5z9/4SydlZlZY/E1ETMzK8xJpCBJ\nJ58VMTM7VzmJmJlZYU4iZmZWmJOImZkV5iRiZmaFOYmYmVlhDfucyJnmvbPMzDwTMTOzGjiJmJlZ\nYU4iZmZWWL9JRNJkSTslPZP+7ZF0l6Sxktok7ZW0WdKYXJulkjok7ZE0KxefJmmXpBclrcjFR0pa\nm9o8JWlS7lhzqr9X0oJ6Dt7MzGrTbxKJiBcj4tqImAZ8CHgDeBxYAmyJiCnAVmApgKSrgduAqcBc\n4CGd2h/kYWBRREwGJkuaneKLgO6IuApYATyQ+hoL3ANcB8wAluWTlZmZDa6BLmfdBPzfiNgP3Ays\nTvHVwC2pPA9YGxHHImIf0AFMl3QJcEFEbEv1Hs21yfe1HrghlWcDbRHRExGvA23AnAGe8xnhvbPM\nzAaeRP4T8PVUHh8RXQARcQi4OMUnAPtzbTpTbAJwIBc/kGJvaRMRx4EeSeNO05eZmQ0BVScRSW8j\nm2U8lkK9H5So54MT/hPfzGwYGMjDhnOBHRHx4/S6S9L4iOhKS1WHU7wTuDTXbmKK9RXPtzkoaQQw\nOiK6JXUCpV5tnqx0ci0tLSfLpVKJp
iY/R2lmllculymXy3XtU9U+eS3pG8ATEbE6vb6f7GL4/ZL+\nEBgbEUvShfWvkV0InwB8C7gqIkLS94G7gG3A3wH/LSKekHQn8O8i4k5J84FbImJ+urC+HZhGNmva\nDnwoXR/Jn1v0Hkd7ezsrV/6Cyy7r/5MNr732BX7nd26r6ueQ+56An1w3s+FLEhFR08pPVX+uS3o7\n2UX1382F7wfWSboDeJnsjiwiYrekdcBu4ChwZ+4dfjGwChgFbIqIJ1J8JbBGUgfwKjA/9fWapPvI\nkkcArb0TiJmZDZ6qkkhE/Ax4V69YN1liqVR/ObC8QnwHcE2F+BFSEqpwbBVZ4hlSPAMxM/MT62Zm\nVgMnETMzK8xJxMzMCnMSMTOzwpxEzMysMCeRgrx3lpmZk4iZmdXAScTMzApzEjEzs8KcRMzMrDAn\nETMzK8z7pRfkvbPMzDwTMTOzGjiJmJlZYU4iZmZWmJOImZkV5iRiZmaFOYkU5L2zzMycRMzMrAZV\nJRFJYyQ9JmmPpBckzZA0VlKbpL2SNksak6u/VFJHqj8rF58maZekFyWtyMVHSlqb2jwlaVLuWHOq\nv1fSgnoN3MzMalftTORBYFNETAU+APwAWAJsiYgpwFZgKYCkq4HbgKnAXOAhnVr3eRhYFBGTgcmS\nZqf4IqA7Iq4CVgAPpL7GAvcA1wEzgGX5ZGVmZoOr3yQiaTTwsYh4BCAijkVED3AzsDpVWw3cksrz\ngLWp3j6gA5gu6RLggojYluo9mmuT72s9cEMqzwbaIqInIl4H2oA5hUZqZmZ1V81M5Argx5IekfSM\npK9KejswPiK6ACLiEHBxqj8B2J9r35liE4ADufiBFHtLm4g4DvRIGneavszMbAioZu+sJmAasDgi\ntkv6MtlSVu/No+q5mdSAb3tqaWk5WS6VSjQ1ndltwbx3lpkNN+VymXK5XNc+q3mnPQDsj4jt6fU3\nyZJIl6TxEdGVlqoOp+OdwKW59hNTrK94vs1BSSOA0RHRLakTKPVq82Slk8wnEYD29vYqhmZmdu4o\nlUqUSqWTr1tbW2vus9/lrLRktV/S5BS6EXgB2AgsTLFmYEMqbwTmpzuurgCuBJ5OS149kqanC+0L\nerVpTuVbyS7UA2wGZqa7w8YCM1PMzMyGgGrXfO4CvibpbcBLwO3ACGCdpDuAl8nuyCIidktaB+wG\njgJ3xqm1n8XAKmAU2d1eT6T4SmCNpA7gVWB+6us1SfcB28mWy1rTBXYzMxsCqkoiEfEc2W22vd3U\nR/3lwPIK8R3ANRXiR0hJqMKxVWSJx8zMhhg/sW5mZoU5iRTkvbPMzJxEzMysBk4iZmZWmJOImZkV\n5iRiZmaFOYmYmVlhZ3aDqQbmvbPMzDwTMTOzGjiJmJlZYU4iZmZWmJOImZkV5iRiZmaFOYkU5L2z\nzMycRMzMrAZOImZmVpiTiJmZFeYkYmZmhTmJmJlZYVUlEUn7JD0naaekp1NsrKQ2SXslbZY0Jld/\nqaQOSXskzcrFp0naJelFSSty8ZGS1qY2T0malDvWnOrvlbSgPsOuXUR4/ywzO+dVOxN5EyhFxLUR\nMT3FlgBbImIKsBVYCiDpauA2YCowF3hIp+6FfRhYFBGTgcmSZqf4IqA7Iq4CVgAPpL7GAvcA1wEz\ngGX5ZGVmZoOr2iSiCnVvBlan8mrgllSeB6yNiGMRsQ/oAKZLugS4ICK2pXqP5trk+1oP3JDKs4G2\niOiJiNeBNmBOledsZmZnWLVJJIBvSdom6bMpNj4iugAi4hBwcYpPAPbn2nam2ATgQC5+IMXe0iYi\njgM9ksadpi8zMxsCqv08kesj4keS3gW0SdpLlljy6nmBYMCPgre0tJwsl0olmpr8USlmZnnlcply\nuVzXPqt6p42IH6V//0XS/wGmA12SxkdEV1qqOpyqdwKX5ppPTLG+4vk2ByWNAEZHRLekTqDUq82T\nl
c4xn0QA2tvbqxmamdk5o1QqUSqVTr5ubW2tuc9+l7MkvV3SO1P5HcAs4HlgI7AwVWsGNqTyRmB+\nuuPqCuBK4Om05NUjaXq60L6gV5vmVL6V7EI9wGZgpqQx6SL7zBQbdN47y8ysupnIeOBxSZHqfy0i\n2iRtB9ZJugN4meyOLCJit6R1wG7gKHBnnLoXdjGwChgFbIqIJ1J8JbBGUgfwKjA/9fWapPuA7WTL\nZa3pAruZmQ0B/SaRiPgh8MEK8W7gpj7aLAeWV4jvAK6pED9CSkIVjq0iSzxmZjbE+Il1MzMrzEnE\nzMwKcxIxM7PC/DBFQd43y8zMMxEzM6uBk4iZmRXmJGJmZoU5iZiZWWFOImZmVpiTSEHeO8vMzEnE\nzMxq4OdEgA0bvsV3v7u733qTJl3IvffefRbOyMxseHASAbq7/5UPf7il33r79vVfx8zsXOLlLDMz\nK8xJxMzMCvNyVkHeO8vMzDMRMzOrgZOImZkV5iRiZmaFVZ1EJJ0n6RlJG9PrsZLaJO2VtFnSmFzd\npZI6JO2RNCsXnyZpl6QXJa3IxUdKWpvaPCVpUu5Yc6q/V9KC2odsZmb1MpCZyOeB/BN5S4AtETEF\n2AosBZB0NXAbMBWYCzykU/uDPAwsiojJwGRJs1N8EdAdEVcBK4AHUl9jgXuA64AZwLJ8sjIzs8FV\nVRKRNBH4JPBXufDNwOpUXg3cksrzgLURcSwi9gEdwHRJlwAXRMS2VO/RXJt8X+uBG1J5NtAWET0R\n8TrQBsypfnhnjvfOMjOrfibyZeCLQP6+1vER0QUQEYeAi1N8ArA/V68zxSYAB3LxAyn2ljYRcRzo\nkTTuNH2ZmdkQ0O9zIpJ+HeiKiGcllU5TtZ4PTgz4T/yWlpaT5VKpRFOTH4ExM8srl8uUy+W69lnN\nO+31wDxJnwTOBy6QtAY4JGl8RHSlparDqX4ncGmu/cQU6yueb3NQ0ghgdER0S+oESr3aPFnpJPNJ\nBKC9vb2KoZmZnTtKpRKlUunk69bW1pr77Hc5KyK+FBGTIuK9wHxga0T8NvA3wMJUrRnYkMobgfnp\njqsrgCuBp9OSV4+k6elC+4JebZpT+VayC/UAm4GZksaki+wzU8zMzIaAWtZ8/hRYJ+kO4GWyO7KI\niN2S1pHdyXUUuDNO7RGyGFgFjAI2RcQTKb4SWCOpA3iVLFkREa9Jug/YTrZc1pousJuZ2RAwoCQS\nEd8Gvp3K3cBNfdRbDiyvEN8BXFMhfoSUhCocW0WWeIYU751lZuYn1s3MrAZOImZmVpiTiJmZFeYk\nYmZmhTmJmJlZYU4iBXnvLDMzJxEzM6uBk4iZmRXmJGJmZoU5iZiZWWFOImZmVpg/dKMg751lZuaZ\niJmZ1cBJxMzMCnMSMTOzwpxEzMysMCcRMzMrzEmkIO+dZWbmJGJmZjXoN4lI+iVJ/yhpp6TnJS1L\n8bGS2iTtlbRZ0phcm6WSOiTtkTQrF58maZekFyWtyMVHSlqb2jwlaVLuWHOqv1fSgvoN3czMatVv\nEomII8C/j4hrgQ8CcyVNB5YAWyJiCrAVWAog6WrgNmAqMBd4SKfWfR4GFkXEZGCypNkpvgjojoir\ngBXAA6mvscA9wHXADGBZPlmZmdngqmo5KyJ+loq/RPaUewA3A6tTfDVwSyrPA9ZGxLGI2Ad0ANMl\nXQJcEBHbUr1Hc23yfa0Hbkjl2UBbRPRExOtAGzBnQCM0M7MzpqokIuk8STuBQ8C3UiIYHxFdABFx\nCLg4VZ8A7M8170yxCcCBXPxAir2lTUQcB3okjTtNX2ZmNgRUtXdWRLwJXCtpNPC4pPeTzUbeUq2O\n5zXg255aWlpOlkulEk1NZ3ZbsNPtnXXPPSt45ZXXq+pn0qQLuffeu+t1WmZmfSqXy5TL5br2OaB3\n2oj4f5LKZEtKXZLGR0RXWqo6nKp1Apfmmk1Msb7i+TYHJY0ARkd
Et6ROoNSrzZOVzi2fRADa29sH\nMrS6euWV17n88paq6u7bV109M7NalUolSqXSydetra0191nN3VkXnbiYLel8YCawB9gILEzVmoEN\nqbwRmJ/uuLoCuBJ4Oi159Uiani60L+jVpjmVbyW7UA+wGZgpaUy6yD4zxczMbAioZibybmC1pPPI\nks7/iohNkr4PrJN0B/Ay2R1ZRMRuSeuA3cBR4M44tfazGFgFjAI2RcQTKb4SWCOpA3gVmJ/6ek3S\nfcB2suWy1nSB3czMhoB+k0hEPA9MqxDvBm7qo81yYHmF+A7gmgrxI6QkVOHYKrLEY2ZmQ4yfWDcz\ns8KcRAry3llmZk4iZmZWAycRMzMrzEnEzMwKcxIxM7PCnETMzKywM7vBVAM73d5ZZmbnCs9EzMys\nMM9EBmDnzudYuLClinq7ufzyM346ZmaDzklkAN54I6ranbe9/ZZ+65iZNQIvZ5mZWWFOImZmVpiT\nSEGtraK11Xtnmdm5zUnEzMwKcxIxM7PCnETMzKwwJxEzMyvMScTMzArrN4lImihpq6QXJD0v6a4U\nHyupTdJeSZsljcm1WSqpQ9IeSbNy8WmSdkl6UdKKXHykpLWpzVOSJuWONaf6eyUtqN/Qa7NsWbBs\nmffPMrNzWzUzkWPAFyLi/cBHgcWS3gcsAbZExBRgK7AUQNLVwG3AVGAu8JBOfY7sw8CiiJgMTJY0\nO8UXAd0RcRWwAngg9TUWuAe4DpgBLMsnKzMzG1z9JpGIOBQRz6byT4E9wETgZmB1qrYaOLHXxzxg\nbUQci4h9QAcwXdIlwAURsS3VezTXJt/XeuCGVJ4NtEVET0S8DrQBc4oM1MzM6m9Ae2dJuhz4IPB9\nYHxEdEGWaCRdnKpNAJ7KNetMsWPAgVz8QIqfaLM/9XVcUo+kcfl4r74aRrWbOk6adCH33nv3mT8h\nM7MBqDqJSHon2Szh8xHxU0m9LwjU8wLBgB8Fb2lpOVkulUo0NQ2PvSWr3dRx377+65iZnU65XKZc\nLte1z6reaSU1kSWQNRGxIYW7JI2PiK60VHU4xTuBS3PNJ6ZYX/F8m4OSRgCjI6JbUidQ6tXmyUrn\nmE8iAO3t7dUMzczsnFEqlSiVSidft7a21txntbf4/jWwOyIezMU2AgtTuRnYkIvPT3dcXQFcCTwd\nEYeAHknT04X2Bb3aNKfyrWQX6gE2AzMljUkX2Wem2KDz3llmZlXMRCRdD3waeF7STrJlqy8B9wPr\nJN0BvEx2RxYRsVvSOmA3cBS4M059luxiYBUwCtgUEU+k+EpgjaQO4FVgfurrNUn3AdvT921NF9jN\nzGwI6DeJRMR3gRF9HL6pjzbLgeUV4juAayrEj5CSUIVjq8gSj5mZDTF+Yt3MzApzEjEzs8KcRMzM\nrLDh8TDFEOR9s8zMPBMxM7MaOImYmVlhTiJmZlaYk4iZmRXmJGJmZoU5iRTkvbPMzJxEzMysBk4i\nZmZWmJOImZkV5iRiZmaFOYmYmVlh3jurIO+dZWbmmYiZmdXAScTMzApzEjEzs8L6TSKSVkrqkrQr\nFxsrqU3SXkmbJY3JHVsqqUPSHkmzcvFpknZJelHSilx8pKS1qc1TkibljjWn+nslLajPkM3MrF6q\nmYk8AszuFVsCbImIKcBWYCmApKuB24CpwFzgIUkn9gZ5GFgUEZOByZJO9LkI6I6Iq4AVwAOpr7HA\nPcB1wAxgWT5ZmZnZ4Os3iUREO/Bar/DNwOpUXg3cksrzgLURcSwi9gEdwHRJlwAXRMS2VO/RXJt8\nX+uBG1J5NtAWET0R8TrQBswZwNjOKO+dZWZW/BbfiyOiCyAiDkm6OMUnAE/l6nWm2DHgQC5+IMVP\ntNmf+jouqUfSuHy8V1/npJ07n2PhwpZ+602adCH33nv3mT8hMzPq95xIPR+aKPTnfUtLy8lyqVSi\nqamxHoF5443g8stb+q23b1/
/dczs3FQulymXy3Xts+g7bZek8RHRlZaqDqd4J3Bprt7EFOsrnm9z\nUNIIYHREdEvqBEq92jzZ1wnlkwhAe3v7AIdkZtbYSqUSpVLp5OvW1taa+6z2Fl/x1hnCRmBhKjcD\nG3Lx+emOqyuAK4GnI+IQ0CNperrQvqBXm+ZUvpXsQj3AZmCmpDHpIvvMFDMzsyGi35mIpK+TzQh+\nWdIrwDLgT4HHJN0BvEx2RxYRsVvSOmA3cBS4MyJOLHUtBlYBo4BNEfFEiq8E1kjqAF4F5qe+XpN0\nH7CdbLmsNV1gNzOzIaLfJBIRv9XHoZv6qL8cWF4hvgO4pkL8CCkJVTi2iizxDDneO8vMzE+sm5lZ\nDRrrFibzrcBmdlY5iTQY3wpsZmeTl7PMzKwwJxEzMyvMSaQg751lZuYkYmZmNXASMTOzwpxEzMys\nMCcRMzMrzEnEzMwK88OGBQ33vbP8ZLuZ1YOTyDnKT7abWT14OcvMzApzEjEzs8KcRMzMrDAnETMz\nK8xJpCDvnWVmNkzuzpI0B1hBlvRWRsT9g3xK5wzfCmxmpzPkk4ik84C/AG4EDgLbJG2IiB8M7pmd\nPfv2lQfte5+NW4HL5TKlUqlw+6HO4xu+Gnls9TLkkwgwHeiIiJcBJK0FbgacRIaQWmYsjf4/qsc3\nfDXy2OplOCSRCcD+3OsDZInFhhA/vGh2bhoOSaSQ8847j1/8Yjf79x8+bb1f/OInnHeeL5CfLZVm\nLM8+W66YXF56aS/vfe+Ufvustp6v25jVnyKG9h5Qkj4CtETEnPR6CRD5i+uShvYgzMyGqIio6a/o\n4ZBERgB7yS6s/wh4GvhUROwZ1BMzM7Ohv5wVEccl/WegjVO3+DqBmJkNAUN+JmJmZkPXsH9iXdIc\nST+Q9KKkPxzs86mVpImStkp6QdLzku5K8bGS2iTtlbRZ0pjBPteiJJ0n6RlJG9PrRhrbGEmPSdqT\nfoczGmx8fyDpnyTtkvQ1SSOH8/gkrZTUJWlXLtbneCQtldSRfr+zBuesq9fH+B5I5/+spG9KGp07\nNuDxDeskknsQcTbwfuBTkt43uGdVs2PAFyLi/cBHgcVpTEuALRExBdgKLB3Ec6zV54HdudeNNLYH\ngU0RMRX4ANnzTA0xPknvAT4HTIuIXyFbDv8Uw3t8j5C9f+RVHI+kq4HbgKnAXOAhSUP91s5K42sD\n3h8RHwQ6qHF8wzqJkHsQMSKOAiceRBy2IuJQRDybyj8F9gATyca1OlVbDdwyOGdYG0kTgU8Cf5UL\nN8rYRgMfi4hHACLiWET00CDjS0YA75DUBJwPdDKMxxcR7cBrvcJ9jWcesDb9XveRvQEP6WfWKo0v\nIrZExJvp5ffJ3l+g4PiGexKp9CDihEE6l7qTdDnwQbJf9PiI6IIs0QAXD96Z1eTLwBeB/MW4Rhnb\nFcCPJT2Sluu+KuntNMj4IuIg8OfAK2TJoycittAg48u5uI/x9H6/6WT4v9/cAWxK5ULjG+5JpGFJ\neiewHvh8mpH0vgNi2N0RIenXga400zrdNHnYjS1pAqYB/z0ipgFvkC2NDPvfHYCkC8n+Sr8MeA/Z\njOTTNMj4TqPRxgOApP8KHI2Ib9TSz3BPIp3ApNzriSk2rKWlgvXAmojYkMJdksan45cAp38Uf2i6\nHpgn6SXgG8ANktYAhxpgbJDNhPdHxPb0+ptkSaURfncANwEvRUR3RBwHHgd+lcYZ3wl9jacTuDRX\nb9i+30haSLas/Fu5cKHxDfcksg24UtJlkkYC84GNg3xO9fDXwO6IeDAX2wgsTOVmYEPvRkNdRHwp\nIiZFxHvJfldbI+K3gb9hmI8NIC2B7Jc0OYVuBF6gAX53ySvARySNShdcbyS7QWK4j0+8dWbc13g2\nAvPTHWlXAFeSPfw81L1lfOmjNb4IzIuII7l6xcYXEcP6C5hD9kR7B7BksM+nDuO5HjgOPAvsB
J5J\nYxwHbEljbQMuHOxzrXGcnwA2pnLDjI3sjqxt6ff3v4ExDTa+ZWQ3e+wiu+j8tuE8PuDrZB8xcYQs\nSd4OjO1rPGR3Mv1z+hnMGuzzLzi+DuDl9N7yDPBQLePzw4ZmZlbYcF/OMjOzQeQkYmZmhTmJmJlZ\nYU4iZmZWmJOImZkV5iRiZmaFOYlYQ5L0J5I+Ienms/ERAZJ+KGlcHfubImmnpB3pwa+BtP2EpI/W\n61zMTsdJxBrVDOAfyR5q/Id6dpw+srm3ej9wdQvwWER8KCJ+OMC2JbLtSMzOOCcRayjpA3eeAz4M\nfA/4LPCwpD+S9C5J21O9D0h6M21Nj6R/Ttt5XCbp79MH9nwrd/wRSQ9L+j5wv6Rx6QOLnpf0l6Rt\nJSS9XdLfplnELkm39jq/EZKelvTx9Hq5pPt61ZkL3A38vqS/T7HHJW1L3++zubpz0mxlZzrfy4Df\nA+5OOwlfL+k/pnY7JZXr/kO3c9tgP5bvL3/V+4ssgTxI9tkX3+l17HngncBispnKp8g28fxuOr4R\n+Ewq3w48nsqPkLZpSa8fBP4olT9JtlXNOOA3gf+Rq3dBhfO7mmxPrRuBHUBThTrLyD6c7MTrC9O/\no9IYxgIXkW1lMalXnd5tdwHvTuXRg/378VdjfXkmYo1oGtkb51SyTxbM+x7wa8DHgT8hW+76GPCd\ndPyjZDsMA6wh28vshMdy5Y8D/xMgIjZx6oN/ngdmphnGr0XET3qfXETsTm3/Frg9Io5VMaa7JT3L\nqQ8Rugr4CPDtiHgl9ft6H23bgdVpBtNUxfcyq5r/g7KGIekDwCqyN9l/Ad6R4s8AH41sx9LvkCWN\nSRGxQdIS4E3g71I3p7u28Uau3LueACKiQ9I0stnJH0vaEhF/XKGva8gSz/gqxvUJ4AZgRkQckfQk\n2Yzk5Pc9nYi4U9J1wG8AOyRNi4jen+ZnVohnItYwIuK5iLgW2BsRV5N9PvasiJgWp7a8/g7wGbKd\nTAG6yd7w29Pr75EtcZHqnZih9PYPwKfh5DWMC1P53cDPI+LrwJ+RzYreQtJvki1HfRz4i/Sxuqcz\nBngtJZD3kc1AIJuVfCxdB0HS2BT/CXCyT0nvjYhtEbGM7LMx8p8ZYVYTz0SsoUi6iFNLS1MiYm/+\neES8nH0UBt9OoXZgQmSfhQ5wF/CIpP9CNpu5/UTTXt/qXuAbkuaTJZ5XUvwa4M8kvQn8K/D7vc7v\nl8mW0W6IiIOSvkJ2feV2+vYE8HuSXiDbnvypNJYfS/pd4PH0+R6Hgdlkn8+yXtI84HPAFyRdlfra\nEhG7TvM16cuWAAAAPklEQVS9zAbEW8GbmVlhXs4yM7PCnETMzKwwJxEzMyvMScTMzApzEjEzs8Kc\nRMzMrDAnETMzK8xJxMzMCvv/iByZnb0a3PEAAAAASUVORK5CYII=\n",
+      "text/plain": [
+       "<matplotlib.figure.Figure at 0x7fafc871f1d0>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Distribution of the number of words in each individual fact (train + test).\n",
+    "# NOTE(review): assumes every entry is a (facts, query) pair whose first item\n",
+    "# is a sequence of tokenised facts -- confirm against the pickled data layout.\n",
+    "fact_groups = train_facts + test_facts\n",
+    "lens = list(len(fact) for facts, _query in fact_groups for fact in facts)\n",
+    "\n",
+    "# Quick sanity check on the first value, followed by summary statistics.\n",
+    "print(lens[0])\n",
+    "print(stats.describe(lens))\n",
+    "\n",
+    "# Histogram of the lengths with the mean marked by a dashed vertical line.\n",
+    "mean_fact_len = np.mean(lens)\n",
+    "plt.hist(lens, alpha=0.5, bins=30)\n",
+    "plt.axvline(mean_fact_len, linewidth=2, linestyle='dashed', color='black')\n",
+    "plt.xlabel('#words x facts')\n",
+    "\n",
+    "# Save in the same cell that draws, so the written file contains this figure.\n",
+    "plt.savefig('plots/word_by_fact.png')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false,
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "#print story_vocab"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}