medaCy / Git / [6c353a] /medacy/tools/calculators/lexical

Models:

philipB/

medaCy

Downloads: 1

[6c353a]: / medacy / tools / calculators / lexical_variation.py

History

Download this file

53 lines (36 with data), 1.6 kB

"""
A command-line tool for creating tabular data regarding the lexical variation
of a given dataset.

python -m medacy.tools.calculators.lexical_variation --help

The table is rendered with the tabulate module; pass -f/--format to select one
of its table formats (for example grid, github, or latex).
"""

import argparse

import tabulate

from medacy.data.dataset import Dataset


def calculate_unique_mentions(dataset):
    """
    Collects, for every tag in a dataset, the distinct mention texts annotated with that tag.

    Every label reported by ``dataset.get_labels()`` gets an entry, so a label
    that never occurs in the annotations maps to an empty set. Each entity
    yielded by an annotation is unpacked as ``(tag, start, end, text)``; only
    the tag and the text are used.

    :param dataset: A Dataset object
    :return: A dictionary mapping tags (str) to a set of mentions (set)
    :raises KeyError: if an entity carries a tag that ``get_labels()`` did not report
    """
    # One independent set per label; the sets must not be shared between keys.
    mentions_by_tag = {label: set() for label in dataset.get_labels()}

    for annotations in dataset.generate_annotations():
        # The character offsets are part of the entity tuple but irrelevant here.
        for tag, _start, _end, mention in annotations:
            mentions_by_tag[tag].add(mention)

    return mentions_by_tag


def main():
    """
    Command-line entry point: prints a table of lexical variation per tag.

    Reads the dataset directory (positional argument) and an optional
    ``-f/--format`` value from the command line. For every tag it reports the
    number of unique mentions, the total number of mentions, and the ratio of
    unique to total mentions. The format value is passed unchanged to
    tabulate as ``tablefmt``.
    """
    arg_parser = argparse.ArgumentParser(description="Calculate the lexical variation in a given dataset")
    arg_parser.add_argument('dataset', help="Path to the dataset directory")
    arg_parser.add_argument('-f', '--format', help="Format to print the table (options include grid, github, and latex)")
    cli_args = arg_parser.parse_args()

    dataset = Dataset(cli_args.dataset)
    mentions_by_tag = calculate_unique_mentions(dataset)
    totals = dataset.compute_counts()

    # The header goes first because tabulate is told to treat the first row as headers.
    header = ['Tag', 'Unique Mentions', 'Total Mentions', 'Ratio']
    rows = [
        [tag, len(unique), totals[tag], len(unique) / totals[tag]]
        for tag, unique in mentions_by_tag.items()
    ]

    rendered = tabulate.tabulate([header] + rows, headers="firstrow", tablefmt=cli_args.format)
    print(rendered)


# Run the command-line tool only when this file is executed as a script
# (directly or via ``python -m``), not when it is imported as a module.
if __name__ == '__main__':
    main()