# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import spacy
import matplotlib.pyplot as plt
import seaborn as sns

# Load the medical transcription dataset
df_medical = pd.read_csv("mtsamples.csv")

# Display basic information about the dataset
print("Dataset Structure:")
df_medical.info()  # info() prints directly and returns None, so no print() wrapper

# Display basic statistics about the numerical columns
print("\nDataset Statistics:")
print(df_medical.describe())

# Display the unique values in the 'medical_specialty' column
print("\nMedical Specialties:")
print(df_medical['medical_specialty'].unique())

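# Quick hedged check (a sketch, not in the original pipeline): per-column NaN
# counts, worth seeing before the cleaning step below since mtsamples contains
# rows with missing transcriptions.
print("\nMissing values per column:")
print(df_medical.isna().sum())
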
# Handle missing values before cleaning: str(x) below turns a NaN transcription
# into the literal string "nan", so calling dropna() on the cleaned column
# afterwards would silently keep those rows
df_medical.dropna(subset=['transcription'], inplace=True)

# Text cleaning: lowercase and keep only purely alphanumeric tokens
df_medical['cleaned_text'] = df_medical['transcription'].apply(
    lambda x: ' '.join(word.lower() for word in str(x).split() if word.isalnum())
)

# Remove duplicate transcriptions
df_medical.drop_duplicates(subset='cleaned_text', inplace=True)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    df_medical['cleaned_text'], df_medical['medical_specialty'],
    test_size=0.2, random_state=42
)

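# Optional hedged variant (a sketch, not the original pipeline): mtsamples
# specialties are heavily imbalanced, so a stratified split keeps class
# proportions comparable between train and validation. It assumes every
# specialty appears at least twice; otherwise train_test_split raises an error.
# X_train, X_val, y_train, y_val = train_test_split(
#     df_medical['cleaned_text'], df_medical['medical_specialty'],
#     test_size=0.2, random_state=42,
#     stratify=df_medical['medical_specialty'])
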
# Feature extraction using TF-IDF (vocabulary capped at 5000 terms)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)  # fit on training data only
X_val_tfidf = tfidf_vectorizer.transform(X_val)  # reuse the fitted vocabulary

# Train the model (random_state added for reproducibility)
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_tfidf, y_train)

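# A quick diagnostic sketch (not in the original pipeline): map the forest's
# feature importances back to TF-IDF vocabulary terms to see which words the
# classifier leans on. get_feature_names_out() assumes scikit-learn >= 1.0.
feature_names = tfidf_vectorizer.get_feature_names_out()
top_idx = np.argsort(clf.feature_importances_)[::-1][:20]
print("\nTop 20 informative terms:")
print(pd.Series(clf.feature_importances_[top_idx], index=feature_names[top_idx]))
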
# Evaluate the model on the validation set
y_pred = clf.predict(X_val_tfidf)
print("Classification Report:")
# zero_division=0 silences warnings for specialties that never get predicted
print(classification_report(y_val, y_pred, zero_division=0))

# Incorporate a language model (spaCy) for tokenization
# (requires the model: python -m spacy download en_core_web_sm)
nlp = spacy.load("en_core_web_sm")

# Tokenize text using spaCy
def tokenize_text(text):
    doc = nlp(text)
    return [token.text for token in doc]

# Apply tokenization to the 'transcription' column
df_medical['tokens'] = df_medical['transcription'].apply(tokenize_text)

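# Hedged performance note (a sketch): calling nlp() row by row is slow on a
# corpus of this size. spaCy's nlp.pipe() batches documents, and the parser
# and NER components can be disabled when only tokenization is needed:
# docs = nlp.pipe(df_medical['transcription'], disable=["parser", "ner"])
# df_medical['tokens'] = [[token.text for token in doc] for doc in docs]
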
# Exploratory Data Analysis (EDA)

# Visualize the distribution of medical specialties
plt.figure(figsize=(12, 6))
sns.countplot(y='medical_specialty', data=df_medical)
plt.title('Distribution of Medical Specialties')
plt.xlabel('Count')
plt.tight_layout()  # long specialty names need the extra margin
plt.show()

# Visualize the most common words in the dataset
# (without stopword filtering, function words like "the" and "and" dominate)
common_words = pd.Series(' '.join(df_medical['cleaned_text']).split()).value_counts()[:10]
common_words.plot(kind='bar', figsize=(12, 6))
plt.title('Top 10 Most Common Words in Transcriptions')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.show()

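# Hedged variant (a sketch): filter with scikit-learn's built-in English
# stopword list so the plot surfaces domain vocabulary instead of function words.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
all_words = pd.Series(' '.join(df_medical['cleaned_text']).split())
content_words = all_words[~all_words.isin(ENGLISH_STOP_WORDS)].value_counts()[:10]
content_words.plot(kind='bar', figsize=(12, 6))
plt.title('Top 10 Content Words (Stopwords Removed)')
plt.show()
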
# Visualize the confusion matrix
conf_matrix = pd.crosstab(y_val, y_pred, rownames=['Actual'], colnames=['Predicted'])
plt.figure(figsize=(12, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.show()