--- /dev/null
+++ b/data_description/data_descriptor.py
@@ -0,0 +1,114 @@
+from collections import Counter
+
+import matplotlib.pyplot as plt
+import nltk
+import pandas as pd
+import seaborn as sns
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from wordcloud import WordCloud
+
+
+def preprocess_data(text, remove_stopwords=True):
+    # word_tokenize and the stopword list both need their NLTK data packages.
+    nltk.download('punkt', quiet=True)
+    nltk.download('stopwords', quiet=True)
+
+    tokens = word_tokenize(text)
+
+    # Lowercase and keep alphabetic tokens regardless of stopword handling.
+    tokens = [word.lower() for word in tokens if word.isalpha()]
+
+    if remove_stopwords:
+        stop_words = set(stopwords.words('english'))
+        tokens = [word for word in tokens if word not in stop_words]
+
+    return tokens
+
+
+def generate_wordcloud(tokens):
+    wordcloud = WordCloud(width=800, height=800,
+                          background_color='white',
+                          stopwords=set(stopwords.words('english')),
+                          min_font_size=10).generate(' '.join(tokens))
+
+    plt.figure(figsize=(8, 8))
+    plt.imshow(wordcloud)
+    plt.axis("off")
+    plt.tight_layout(pad=0)
+    plt.show()
+
+
+def generate_most_common(tokens):
+    # Horizontal bar chart of the 20 most frequent tokens.
+    most_common = Counter(tokens).most_common(20)
+
+    plt.figure(figsize=(12, 8))
+    sns.barplot(x=[count for _, count in most_common],
+                y=[token for token, _ in most_common])
+    plt.show()
+
+
+def describe_cvs():
+    data = pd.read_csv('../datasets/labelled_data/all.csv',
+                       names=['text', 'entity'], header=None, sep="|")
+    print(f"Data loaded into dataframe:\n\n{data.head(10)}\n\n")
+
+    # Count how often each entity tag occurs across the whole dataset.
+    unique_tags = data['entity'].apply(
+        lambda x: pd.Series(x.split(" ")).value_counts()).sum(axis=0)
+    print(f"Entities in the data:\n\n{unique_tags}\n\n")
+
+    all_tokens = data['entity'].apply(lambda x: len(x.split(" "))).sum()
+    print(f"All tokens in CoNLL file: {all_tokens}")
+
+    # Sentence length in tokens, derived from the per-token tag sequence.
+    sent_len = data['entity'].apply(lambda x: len(x.split(" ")))
+    longest_sentence = sent_len.max()
+    shortest_sentence = sent_len.min()
+    median_sentence = sent_len.median()
+    mean_sentence = sent_len.mean()
+    print("Sentence Length Statistics:\n")
+    print(f"min: {shortest_sentence}")
+    print(f"max: {longest_sentence}")
+    print(f"median: {median_sentence}")
+    print(f"mean: {mean_sentence}")
+
+    plt.figure(figsize=(10, 6))
+    sent_len.plot(kind='hist', bins=20, color='skyblue', edgecolor='black')
+    plt.title('Histogram of Sentence Lengths')
+    plt.xlabel('Length of sentence')
+    plt.ylabel('Frequency')
+    plt.grid(axis='y', alpha=0.75)
+    plt.show()
+
+    sns.set_theme(style="whitegrid")
+    palette = sns.color_palette("coolwarm", 7)
+
+    plt.figure(figsize=(15, 6))
+    sns.boxplot(x=sent_len, color=palette[3], saturation=0.75)
+
+    plt.title('Boxplot of Sentence Lengths', fontsize=16)
+    plt.xlabel('Sentence Length', fontsize=14)
+    plt.xticks(fontsize=12)
+    plt.yticks([])
+
+    plt.show()
+
+
+with open('entities.txt', 'r') as file:
+    # One entity per line; strip trailing newlines so tokens are clean.
+    entities_tokens = [line.strip() for line in file]
+
+with open('text.txt', 'r') as file:
+    text = file.read()
+
+text_tokens = preprocess_data(text)
+
+generate_wordcloud(text_tokens)
+generate_most_common(text_tokens)
+
+generate_wordcloud(entities_tokens)
+generate_most_common(entities_tokens)
+
+describe_cvs()