[0eda78]: /data_description/data_descriptor.py


from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd


def preprocess_data(text, remove_stopwords=True):
    """Tokenize raw text; optionally lowercase, keep alphabetic tokens, and drop English stopwords."""
    nltk.download('punkt')
    nltk.download('stopwords')
    tokens = word_tokenize(text)
    if remove_stopwords:
        tokens = [word.lower() for word in tokens if word.isalpha()]
        tokens = [word for word in tokens if word not in stopwords.words('english')]
    return tokens


def generate_wordcloud(tokens):
    # Render a word cloud of the given tokens.
    wordcloud = WordCloud(width=800, height=800,
                          background_color='white',
                          stopwords=stopwords.words('english'),
                          min_font_size=10).generate(' '.join(tokens))
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()


def generate_most_common(tokens):
    # Plot the 20 most frequent tokens as a horizontal bar chart.
    freq_dist = Counter(tokens)
    most_common = freq_dist.most_common(20)
    plt.figure(figsize=(12, 8))
    sns.barplot(x=[count for _, count in most_common], y=[word for word, _ in most_common])
    plt.show()


def describe_cvs():
    # Load the labelled data: one sentence per row, text and entity tags separated by "|".
    data = pd.read_csv('../datasets/labelled_data/all.csv', names=['text', 'entity'], header=None, sep="|")
    print(f"Data loaded into dataframe:\n\n{data.head(10)}\n\n")

    # Count how often each entity tag occurs across the whole file.
    unique_tags = data['entity'].apply(lambda x: pd.Series(x.split(" ")).value_counts()).sum(axis=0)
    print(f"Entities in the data:\n\n{unique_tags}\n\n")

    all_tokens = data['entity'].apply(lambda x: len(x.split(" "))).sum(axis=0)
    print(f"All tokens in ConLL file: {all_tokens}")

    # Sentence length measured in tokens (one entity tag per token).
    sent_len = data['entity'].apply(lambda x: len(x.split(" ")))
    longest_sentence = sent_len.max()
    shortest_sentence = sent_len.min()
    median_sentence = sent_len.median()
    mean_sentence = sent_len.mean()
    print("Sentence Length Statistics:\n")
    print(f"min: {shortest_sentence}")
    print(f"max: {longest_sentence}")
    print(f"median: {median_sentence}")
    print(f"mean: {mean_sentence}")

    # Histogram of sentence lengths.
    plt.figure(figsize=(10, 6))
    sent_len.plot(kind='hist', bins=20, color='skyblue', edgecolor='black')
    plt.title('Histogram of Sentence Lengths')
    plt.xlabel('Length of sentence')
    plt.ylabel('Frequency')
    plt.grid(axis='y', alpha=0.75)
    plt.show()

    # Horizontal boxplot of the same distribution.
    sns.set(style="whitegrid")
    palette = sns.color_palette("coolwarm", 7)
    plt.figure(figsize=(15, 6))
    sns.boxplot(x=sent_len, color=palette[3], saturation=0.75)
    plt.title('Boxplot of Sentence Lengths', fontsize=16)
    plt.xlabel('Sentence Length', fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks([])
    plt.show()


if __name__ == '__main__':
    # Entity tags are already one token per line, so no preprocessing is needed beyond stripping newlines.
    with open('entities.txt', 'r') as file:
        entities_tokens = [line.strip() for line in file]
    with open('text.txt', 'r') as file:
        text = file.read()
    text_tokens = preprocess_data(text)

    generate_wordcloud(text_tokens)
    generate_most_common(text_tokens)
    generate_wordcloud(entities_tokens)
    generate_most_common(entities_tokens)
    describe_cvs()