--- a +++ b/ClassifyText.ipynb @@ -0,0 +1,252 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import scipy.sparse as sp\n", + "from numpy.linalg import norm\n", + "from collections import Counter, defaultdict\n", + "from scipy.sparse import csr_matrix\n", + "import nltk\n", + "import string\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from collections import Counter\n", + "import re\n", + "from nltk.corpus import stopwords\n", + "from string import digits\n", + "#nltk.download(\"wordnet\")\n", + "from nltk.stem import PorterStemmer, WordNetLemmatizer\n", + "from collections import Counter\n", + "from scipy.sparse import csr_matrix\n", + "from sklearn import model_selection\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n", + "from sklearn.svm import SVC\n", + "from sklearn.metrics import confusion_matrix, classification_report\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['catheterization laboratory events and hospital outcome with direct angioplasty for acute myocardial infarction to assess the safety of direct infarct angioplasty without antecedent thrombolytic therapy, catheterization laboratory and hospital events were assessed in consecutively treated patients with infarctions involving the left anterior descending (n = 100 patients), right (n = 100), and circumflex (n = 50) coronary arteries. the groups of patients were similar for age (left anterior descending coronary artery, 59 years; right coronary artery, 58 years; circumflex coronary artery, 62 years), patients with multivessel disease (left anterior descending coronary artery, 55%; right coronary artery, 55%; circumflex coronary artery, 64%), and patients with initial grade 0/1 antegrade flow (left anterior descending coronary artery, 79%; right coronary artery, 84%; circumflex coronary artery, 90%). cardiogenic shock was present in eight patients with infarction of the left anterior descending coronary artery, four with infarction of the right coronary artery, and four with infarction of the circumflex coronary artery. major catheterization laboratory events (cardioversion, cardiopulmonary resuscitation, dopamine or intra-aortic balloon pump support for hypotension, and urgent surgery) occurred in 10 patients with infarction of the left anterior descending coronary artery, eight with infarction of the right coronary artery, and four with infarction of the circumflex coronary artery (16 of 16 shock and six of 234 nonshock patients, p less than 0.001). there was one in-laboratory death (shock patient with infarction of the left anterior descending coronary artery). \\n', 'renal abscess in children. three cases of renal abscesses in children are described to illustrate the variable presenting features. an additional 23 pediatric cases, reported over the past ten years, were reviewed for clinical features and therapy. fever, loin pain, and leukocytosis were common presenting features, but less than half of all abscesses were associated with either an abnormal urinalysis or a positive urine culture. the presenting features were sometimes confused with appendicitis, peritonitis, or a wilms tumor. an organism was identified in 17 cases--escherichia coli in 9 children and staphylococcus aureus in 8 children. the majority of e. coli infections occurred in girls and the majority of s. aureus infections occurred in boys. reflux was documented in 5 patients, and 2 children had a possible extrarenal source of infection. antibiotics alone produced a cure in 10 children (38%), but 16 children (62%) required a surgical procedure. \\n']\n", + "['4', '5']\n" + ] + } + ], + "source": [ + "# open docs file and read its lines\n", + "with open(\"train.dat\", \"r\") as fh:\n", + " lines = fh.readlines() \n", + " \n", + "train_arr = []\n", + "train_lbl = []\n", + "\n", + "for i in range(len(lines)):\n", + " train_arr.append(lines[i].lower()[2:])\n", + " train_lbl.append(lines[i][0:1])\n", + "\n", + "print train_arr[0:2]\n", + "print train_lbl[0:2]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['catheterization laboratory events hospital outcome direct angioplasty acute myocardial infarction assess safety direct infarct angioplasty without antecedent thrombolytic therapy, catheterization laboratory hospital events assessed consecutively treated patients infarctions involving left anterior descending patients), right ), circumflex coronary arteries. groups patients similar (left anterior descending coronary artery, years; right coronary artery, years; circumflex coronary artery, years), patients multivessel disease (left anterior descending coronary artery, %; right coronary artery, %; circumflex coronary artery, %), patients initial grade antegrade flow (left anterior descending coronary artery, %; right coronary artery, %; circumflex coronary artery, %). cardiogenic shock present eight patients infarction left anterior descending coronary artery, four infarction right coronary artery, four infarction circumflex coronary artery. major catheterization laboratory events (cardioversion, cardiopulmonary resuscitation, dopamine intra-aortic balloon pump support hypotension, urgent surgery) occurred patients infarction left anterior descending coronary artery, eight infarction right coronary artery, four infarction circumflex coronary artery shock nonshock patients, less .). -laboratory death (shock patient infarction left anterior descending coronary artery).', 'renal abscess children. three cases renal abscesses children described illustrate variable presenting features. additional pediatric cases, reported past years, reviewed clinical features therapy. fever, loin pain, leukocytosis common presenting features, less half abscesses associated either abnormal urinalysis positive urine culture. presenting features sometimes confused appendicitis, peritonitis, wilms tumor. organism identified cases--escherichia coli children staphylococcus aureus children. majority coli infections occurred girls majority aureus infections occurred boys. reflux documented patients, children possible extrarenal source infection. antibiotics alone produced cure children (%), children (%) required surgical procedure.']\n" + ] + } + ], + "source": [ + "#remove short words\n", + "def filterLen(tdocs, minlen):\n", + " return [ ' '.join(t for t in d if len(t) >= minlen ) for d in tdocs ]\n", + "\n", + "trainDocs = [l.split() for l in train_arr]\n", + "train_filtered = filterLen(trainDocs,4)\n", + "\n", + "#lemmatize and remove stop words\n", + "def preProcess(data_arr):\n", + " lemmatiser = WordNetLemmatizer()\n", + " pattern = re.compile(r'\\b(' + r'|'.join(stopwords.words('english')) + r')\\b\\s*')\n", + " #print train_arr[0:10]\n", + " for i in range(len(train_arr)):\n", + " data_arr[i] = pattern.sub('', data_arr[i])\n", + " data_arr[i] = data_arr[i].translate(None, digits)\n", + " #data_arr[i] = nltk.word_tokenize(lemmatiser.lemmatize(data_arr[i],pos=\"v\").translate(None, string.punctuation))\n", + " return data_arr\n", + "\n", + "train_arr = preProcess(train_filtered)\n", + "print train_arr[0:2]" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(14438, 34727)\n" + ] + } + ], + "source": [ + "#create TDIF matrix\n", + "vectorizer = CountVectorizer()\n", + "tfidf = TfidfTransformer()\n", + "vectorizer.fit(train_arr)\n", + "x_train_arr = vectorizer.transform(train_arr)\n", + "tfidf.fit(x_train_arr)\n", + "x_train = tfidf.transform(x_train_arr)\n", + "print x_train.shape\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "# open docs file and read its lines - for test mode\n", + "# with open(\"test.dat\", \"r\") as fh:\n", + "# linesTest = fh.readlines() \n", + " \n", + "# for i in range(len(linesTest)):\n", + "# test_arr.append(linesTest[i].lower())\n", + " \n", + "# test_arr = preProcess(test_arr)\n", + "\n", + "# vectorizer.fit(test_arr)\n", + "# x_test_arr = vectorizer.transform(test_arr)\n", + "# tfidf.fit(x_test_arr)\n", + "# x_test = tfidf.transform(x_test_arr)\n", + "\n", + "# print x_train.shape\n", + "# print len(train_lbl)\n", + "# print x_test.shape\n" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(10828, 34727)\n", + "(3610, 34727)\n", + "(10828,)\n", + "(3610,)\n" + ] + } + ], + "source": [ + "#split to train and test\n", + "train_lbl = np.asarray(train_lbl)\n", + "X_train, X_test, Y_train, Y_test = model_selection.train_test_split(x_train, train_lbl, test_size=0.25, random_state=42)\n", + "print X_train.shape\n", + "print X_test.shape\n", + "\n", + "print Y_train.shape\n", + "print Y_test.shape\n" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "# from sklearn.feature_selection import SelectKBest,mutual_info_classif,f_classif,VarianceThreshold,chi2,SelectPercentile,SelectFdr\n", + "# featureSelector1 = SelectKBest(f_classif, k=200)\n", + "# X_new = featureSelector1.fit_transform(X_train,Y_train)\n", + "# print X_new.shape\n", + "\n", + "# X_test_new = featureSelector1.transform(X_test)\n", + "# print X_test_new.shape\n", + "\n", + "X_new = X_train\n", + "X_test_new = X_test" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\n", + "from sklearn.linear_model import SGDClassifier, LogisticRegression, Perceptron\n", + "from sklearn.svm import LinearSVC\n", + "#from sklearn.neural_network import MLPClassifier\n", + "kfold = model_selection.StratifiedKFold(n_splits=5)\n", + "#model = ExtraTreesClassifier(random_state=42, class_weight ='balanced_subsample', n_estimators=200,min_samples_split = 5)\n", + "model = SGDClassifier(loss='hinge', penalty='l2',alpha=0.0004, max_iter=100, random_state=42)\n", + "#model = LogisticRegression(class_weight='balanced', C=0.1)\n", + "#model = MLPClassifier(max_iter=18)\n", + "#model = AdaBoostClassifier(LinearSVC(penalty='l1', dual=False,tol=1e-3, class_weight='balanced',C=0.1),algorithm='SAMME')\n", + "\n", + "model.fit(X_new,Y_train)\n", + "predict = model.predict(X_test_new)\n", + "#Cross Validate\n", + "#results = model_selection.cross_val_score(model, x_train, train_lbl, cv=kfold, scoring='f1_micro')\n", + "#print(results.mean())\n", + "\n", + "print '\\n clasification report:\\n', classification_report(Y_test,predict)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}