from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
def preprocess_data(text, remove_stopwords=True):
    """Tokenize raw text, keep alphabetic tokens, and optionally drop stopwords."""
    # Fetch the tokenizer model and stopword list if they are not cached yet.
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
    tokens = word_tokenize(text)
    # Lowercase and keep purely alphabetic tokens (drops punctuation and numbers).
    tokens = [word.lower() for word in tokens if word.isalpha()]
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))  # build the set once for O(1) lookups
        tokens = [word for word in tokens if word not in stop_words]
    return tokens
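
# Hypothetical sanity check for preprocess_data (illustrative only, not part
# of the pipeline below):
#   preprocess_data("The quick brown fox jumps over the lazy dog.")
#   -> ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']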

def generate_wordcloud(tokens):
    """Render a word cloud from a list of tokens."""
    wordcloud = WordCloud(width=800, height=800,
                          background_color='white',
                          stopwords=set(stopwords.words('english')),
                          min_font_size=10).generate(' '.join(tokens))
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wordcloud, interpolation='bilinear')  # smooth scaling of the rendered image
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.show()
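
# If the clouds need to be saved rather than (or as well as) shown, the
# wordcloud library exposes WordCloud.to_file, e.g. wordcloud.to_file('cloud.png')
# inside generate_wordcloud; the filename here is illustrative.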

def generate_most_common(tokens):
    """Plot a horizontal bar chart of the 20 most frequent tokens."""
    most_common = Counter(tokens).most_common(20)
    words = [word for word, _ in most_common]
    counts = [count for _, count in most_common]
    plt.figure(figsize=(12, 8))
    sns.barplot(x=counts, y=words)
    plt.title('20 Most Common Tokens')
    plt.xlabel('Frequency')
    plt.show()
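
# Counter.most_common returns (token, count) pairs sorted by descending count,
# e.g. Counter(['ner', 'cv', 'ner']).most_common(1) == [('ner', 2)], which is
# why the counts go on the x axis and the tokens on the y axis above.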

def describe_cvs():
    """Load the labelled CV data and report basic corpus statistics."""
    data = pd.read_csv('../datasets/labelled_data/all.csv',
                       names=['text', 'entity'], header=None, sep='|')
    print(f"Data loaded into dataframe:\n\n{data.head(10)}\n\n")
    # Count how often each entity tag occurs across the whole corpus
    # (explode flattens the per-sentence tag lists into one Series).
    unique_tags = data['entity'].str.split(' ').explode().value_counts()
    print(f"Entities in the data:\n\n{unique_tags}\n\n")
    all_tokens = data['entity'].apply(lambda x: len(x.split(' '))).sum()
    print(f"All tokens in CoNLL file: {all_tokens}")
    # Sentence length is measured in tokens (one tag per token), not characters.
    sent_len = data['entity'].apply(lambda x: len(x.split(' ')))
    longest_sentence = sent_len.max()
    shortest_sentence = sent_len.min()
    median_sentence = sent_len.median()
    mean_sentence = sent_len.mean()
    print("Sentence Length Statistics:\n")
    print(f"min: {shortest_sentence}")
    print(f"max: {longest_sentence}")
    print(f"median: {median_sentence}")
    print(f"mean: {mean_sentence}")
    plt.figure(figsize=(10, 6))
    sent_len.plot(kind='hist', bins=20, color='skyblue', edgecolor='black')
    plt.title('Histogram of Sentence Lengths')
    plt.xlabel('Length of sentence')
    plt.ylabel('Frequency')
    plt.grid(axis='y', alpha=0.75)
    plt.show()
    sns.set(style='whitegrid')
    palette = sns.color_palette('coolwarm', 7)
    plt.figure(figsize=(15, 6))
    # Pass the series explicitly as x so the horizontal orientation is unambiguous.
    sns.boxplot(x=sent_len, color=palette[3], saturation=0.75, orient='h')
    plt.title('Boxplot of Sentence Lengths', fontsize=16)
    plt.xlabel('Sentence Length', fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks([])
    plt.show()
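
# A possible companion to describe_cvs(): plotting the tag frequencies it
# prints. This function is an illustrative sketch, not part of the original
# script; it assumes the same '|'-separated CSV layout used above.
def plot_tag_distribution(csv_path='../datasets/labelled_data/all.csv'):
    """Bar chart of entity-tag frequencies in the labelled data."""
    data = pd.read_csv(csv_path, names=['text', 'entity'], header=None, sep='|')
    tag_counts = data['entity'].str.split(' ').explode().value_counts()
    plt.figure(figsize=(12, 6))
    sns.barplot(x=tag_counts.values, y=tag_counts.index)
    plt.title('Entity Tag Distribution')
    plt.xlabel('Frequency')
    plt.show()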

if __name__ == '__main__':
    # The entities file is already one token per line, so only strip newlines.
    with open('entities.txt', 'r') as file:
        entities_tokens = [line.strip() for line in file]
    with open('text.txt', 'r') as file:
        text = file.read()
    text_tokens = preprocess_data(text)
    generate_wordcloud(text_tokens)
    generate_most_common(text_tokens)
    generate_wordcloud(entities_tokens)
    generate_most_common(entities_tokens)
    describe_cvs()