[0eda78]: / data_description / describe_entitiy_location.py

Download this file

65 lines (50 with data), 2.2 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import argparse
# Command-line interface: the only option is the path to the annotated CSV file.
parser = argparse.ArgumentParser(
    description='This script is used to describe the relative positions of entities in the text.')
parser.add_argument('-i', '--input', type=str, required=True,
                    help='Choose the input CSV file.')
args = parser.parse_args()

# Refuse anything that is not named like a CSV file before trying to read it.
# The comparison is case-insensitive so that "DATA.CSV" is accepted as well.
if not args.input.lower().endswith('.csv'):
    raise ValueError('Input file needs to be defined as a CSV-file')

# Every line of the input is read as "<text>|<annotations>"; there is no header row.
df = pd.read_csv(args.input, sep='|', header=None, names=['Text', 'Annotations'])

# Turn each annotation string into a list of tags. Splitting on arbitrary
# whitespace (instead of a single ' ') drops leading/trailing blanks and avoids
# empty-string "tags" when two tags are separated by more than one space; such
# empty strings would otherwise be counted as entities because they are not 'O'.
df['Annotations'] = df['Annotations'].str.split()
def extract_entities(annotations):
    """Return the ``(index, tag)`` pairs of every tag that is not ``'O'``.

    Args:
        annotations: Iterable of tag strings for one text, in token order.
            A non-iterable value (for example the NaN or ``None`` left in a
            column for a row without annotations) is treated as "no tags".

    Returns:
        A list of ``(position, tag)`` tuples, where ``position`` is the
        zero-based index of the tag within ``annotations``.
    """
    try:
        tags = iter(annotations)
    except TypeError:
        # Missing annotation cell: there is nothing to extract.
        return []
    return [(index, tag) for index, tag in enumerate(tags) if tag != 'O']
# Store, for every row, the list of (token index, tag) pairs of its non-'O' tags.
df['EntityPositions'] = df['Annotations'].map(extract_entities)
def calculate_relative_positions(dataframe):
    """Collect the relative position of every non-``'O'`` tag in the data.

    For each row, the zero-based index of a tag is divided by the number of
    tags in that row, so every value lies in ``[0, 1)``: the first token maps
    to ``0.0`` and the last token of an ``n``-token row to ``(n - 1) / n``.

    Args:
        dataframe: Table with an ``'Annotations'`` column whose cells are
            sequences of tag strings. Cells without a length (for example the
            NaN or ``None`` of a row that has no annotations) are skipped.

    Returns:
        A flat list of floats, one per non-``'O'`` tag, in row order.
    """
    relative_positions = []
    # Only the 'Annotations' column is needed, so iterate it directly instead
    # of materialising every row with iterrows().
    for annotations in dataframe['Annotations']:
        try:
            total_length = len(annotations)
        except TypeError:
            # Missing annotation cell: nothing to count for this row.
            continue
        # An empty sequence yields no positions, so no division by zero occurs.
        relative_positions.extend(
            index / total_length
            for index, tag in enumerate(annotations)
            if tag != 'O'
        )
    return relative_positions
def _label_and_show(y_label, title, *grid_args, **grid_kwargs):
    """Label the current figure, apply the grid setting and display it."""
    plt.xlabel("Relative Word Position")
    plt.ylabel(y_label)
    plt.title(title)
    plt.grid(*grid_args, **grid_kwargs)
    plt.show()


relative_positions = calculate_relative_positions(df)

# Figure 1: smoothed density estimate of the relative entity positions.
plt.figure(figsize=(12, 6))
sns.kdeplot(relative_positions, bw_adjust=0.5, fill=True)
_label_and_show(
    "Density",
    "Density Plot of Entities Across Relative Word Positions",
    True,
)

# Figure 2: histogram of the same values with 20 bins.
plt.figure(figsize=(12, 6))
plt.hist(
    relative_positions,
    bins=20,
    color='skyblue',
    edgecolor='black',
    alpha=0.7,
)
_label_and_show(
    "Frequency",
    "Distribution of Entities Across Relative Word Positions",
    axis='y',
)

# Figure 3: line graph of the counts in 20 equal-width bins spanning 0 to 1,
# drawn at the midpoint of each bin.
bin_edges = np.linspace(0, 1, 21)
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
hist, _ = np.histogram(relative_positions, bins=bin_edges)

plt.figure(figsize=(12, 6))
plt.plot(bin_centers, hist, marker='o', linestyle='-', color='purple')
_label_and_show(
    "Frequency",
    "Distribution Graph of Entities Across Relative Word Positions",
    True,
)