Diff of /code.txt [000000] .. [610063]

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import spacy
import matplotlib.pyplot as plt
import seaborn as sns

# Load the medical transcription dataset
df_medical = pd.read_csv("mtsamples.csv")

# Display basic information about the dataset
# (df.info() prints its summary itself and returns None, so it is not
# wrapped in print(), which would emit a stray "None")
print("Dataset Structure:")
df_medical.info()

# Display basic statistics about the numerical columns
print("\nDataset Statistics:")
print(df_medical.describe())

# Display the unique values in the 'medical_specialty' column
print("\nMedical Specialties:")
print(df_medical['medical_specialty'].unique())

# Handle missing values first: drop rows with no transcription so that
# str(x) in the cleaning step never turns NaN into the literal string "nan"
df_medical.dropna(subset=['transcription'], inplace=True)

# Text cleaning: lowercase and keep only purely alphanumeric tokens
# (note that tokens containing punctuation, e.g. "patient,", are dropped)
df_medical['cleaned_text'] = df_medical['transcription'].apply(
    lambda x: ' '.join(word.lower() for word in str(x).split() if word.isalnum())
)

# Remove duplicate documents
df_medical.drop_duplicates(subset='cleaned_text', inplace=True)

# Splitting the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    df_medical['cleaned_text'], df_medical['medical_specialty'],
    test_size=0.2, random_state=42
)
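
# A hedged aside, not in the original script: the specialty labels are
# heavily imbalanced, so a stratified split keeps class proportions
# comparable between the two sets. Stratification needs at least two
# samples per class, so rarer specialties are filtered out first; the
# threshold of 2 and the *_strat names are illustrative assumptions.
specialty_counts = df_medical['medical_specialty'].value_counts()
frequent = specialty_counts[specialty_counts >= 2].index
df_strat = df_medical[df_medical['medical_specialty'].isin(frequent)]
X_train_strat, X_val_strat, y_train_strat, y_val_strat = train_test_split(
    df_strat['cleaned_text'], df_strat['medical_specialty'],
    test_size=0.2, random_state=42, stratify=df_strat['medical_specialty']
)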

# Feature extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)
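
# A quick sanity check, not in the original script: peek at a slice of
# the learned vocabulary. get_feature_names_out() is the scikit-learn
# (>= 1.0) accessor; showing 20 terms is an arbitrary choice.
print("\nSample of TF-IDF vocabulary:")
print(tfidf_vectorizer.get_feature_names_out()[:20])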

# Train the model (a fixed seed keeps the run reproducible)
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_tfidf, y_train)

# Evaluate the model; zero_division=0 silences warnings for specialties
# that never appear among the predictions
y_pred = clf.predict(X_val_tfidf)
print("Classification Report:")
print(classification_report(y_val, y_pred, zero_division=0))
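
# An optional persistence step, not part of the original pipeline: save
# the fitted vectorizer and classifier so inference can run without
# retraining. joblib is the serializer the scikit-learn docs recommend
# for fitted estimators; the file names are illustrative assumptions.
import joblib
joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.joblib")
joblib.dump(clf, "specialty_clf.joblib")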

# Incorporate a language model (spaCy) for tokenization
nlp = spacy.load("en_core_web_sm")

# Tokenize text using spaCy
def tokenize_text(text):
    doc = nlp(text)
    return [token.text for token in doc]

# Apply tokenization to the 'transcription' column
df_medical['tokens'] = df_medical['transcription'].apply(tokenize_text)
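
# A performance note, not in the original: calling nlp() row by row is
# slow on thousands of long documents. nlp.pipe() streams texts through
# the pipeline in batches, and disabling unused components speeds it up
# further. This is a drop-in faster equivalent of the .apply() above;
# the batch_size of 50 is an illustrative assumption.
docs = nlp.pipe(df_medical['transcription'].astype(str),
                batch_size=50, disable=["parser", "ner"])
df_medical['tokens'] = [[token.text for token in doc] for doc in docs]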

# Exploratory Data Analysis (EDA)
# Visualize the distribution of medical specialties
plt.figure(figsize=(12, 6))
sns.countplot(y='medical_specialty', data=df_medical)
plt.title('Distribution of Medical Specialties')
plt.xlabel('Count')
plt.show()
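
# A readability tweak, not in the original: with dozens of specialties
# the unordered bars are hard to compare, so sort them by frequency via
# countplot's order parameter.
plt.figure(figsize=(12, 6))
sns.countplot(y='medical_specialty', data=df_medical,
              order=df_medical['medical_specialty'].value_counts().index)
plt.title('Distribution of Medical Specialties (sorted)')
plt.xlabel('Count')
plt.show()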

# Visualize the most common words in the dataset
common_words = pd.Series(' '.join(df_medical['cleaned_text']).split()).value_counts()[:10]
common_words.plot(kind='bar', figsize=(12, 6))
plt.title('Top 10 Most Common Words in Transcriptions')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.show()
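
# A hedged refinement, not in the original: the raw counts above are
# dominated by function words ("the", "and", ...), so filter with
# scikit-learn's built-in English stop-word list before counting; the
# top-10 cutoff mirrors the plot above.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
all_words = pd.Series(' '.join(df_medical['cleaned_text']).split())
content_words = all_words[~all_words.isin(ENGLISH_STOP_WORDS)].value_counts()[:10]
content_words.plot(kind='bar', figsize=(12, 6))
plt.title('Top 10 Most Common Words (stop words removed)')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.show()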

# Visualize the confusion matrix
conf_matrix = pd.crosstab(y_val, y_pred, rownames=['Actual'], colnames=['Predicted'])
plt.figure(figsize=(12, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.show()
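
# A hedged variant, not in the original: with imbalanced classes the raw
# counts mostly reflect class size, so normalizing each row shows the
# fraction of every actual specialty routed to each predicted one.
conf_matrix_norm = conf_matrix.div(conf_matrix.sum(axis=1), axis=0)
plt.figure(figsize=(12, 8))
sns.heatmap(conf_matrix_norm, annot=False, cmap='Blues')
plt.title('Confusion Matrix (row-normalized)')
plt.show()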