In [1]:
!pip install benepar
!pip install spacy

Collecting benepar
  Downloading benepar-0.2.0.tar.gz (33 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting spacy>=2.0.9 (from benepar)
  Downloading spacy-3.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m51.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Collecting torch-struct>=0.5 (from benepar)
  Downloading torch_struct-0.5-py3-none-any.whl (34 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy>=2.0.9->benepar)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl (29 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy>=2.0.9->benepar)
  Downloading spacy_loggers-1.0.4-py3-none-any.whl (11 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy>=2.0.9->benepar)
  Downloading murmurhash-1.0.9-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy>=2.0.9->benepar)

In [1]:
import benepar, spacy
!python -m spacy download en_core_web_md
nlp = spacy.load('en_core_web_md')
doc = nlp("The time for action is now. It's never too late to do something.")

import matplotlib.pyplot as plt

Collecting en-core-web-md==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [2]:
def find_root_verb_and_its_dobj(tree_root):
    # first check if the current node and its children satisfy the condition
    if tree_root.pos_ == "VERB":
        for child in tree_root.children:
            if child.dep_ == "dobj" and child.pos_ == "NOUN":
                return tree_root.lemma_, child.lemma_
        return tree_root.lemma_, None
    # if not, check its children
    for child in tree_root.children:
        return find_root_verb_and_its_dobj(child)
    # if no children satisfy the condition, return None
    return None, None

def find_root_verb_and_its_dobj_in_string(s):
    doc = nlp(s)
    first_sent = list(doc.sents)[0]
    return find_root_verb_and_its_dobj(first_sent.root)

find_root_verb_and_its_dobj_in_string("Write me a story about education.")

('write', 'story')

In [3]:
import pandas as pd
import json
import tqdm
import os

dataset_name_list = ["pretrain", "instruct_10K", "instruct_60K", "instruct_60K_intext"]

folder_path = "../../data/"
file_path_list = [
    "2023-04-26_text_image_alignment.json",     
    "2023-04-28_conv_finetune_processed.json", 
    "2023-05-10_finetune_postprocess_caption_cleaned", 
    "2023-05-10_finetune_postprocess_caption_in_text_cleaned"
]
file_path_list = [os.path.join(folder_path, f) for f in file_path_list]
generated_data_path = file_path_list

dict_instruct_response = {}

for i, ds_name in enumerate(dataset_name_list):

    with open(generated_data_path[i], 'r') as fin:
        line_dict = json.load(fin)

    print(len(line_dict))
    instruct_list, response_list = [], []
    for line in line_dict:
        try:
            conversations = line['conversations']
        except:
            conversations = line['conversatons']

        for c in conversations:
            if c['from'] == "human":
                if "<image>" in c['value']:
                    for c1 in c['value'].split("\n"):
                        if "<image>" not in c1:
                            instruct_list.append(c1)
                else:
                    instruct_list.append(c['value'])
            elif c['from'] == "gpt":
                response_list.append(c['value'])

    dict_instruct_response[ ds_name ] = {"instruct": instruct_list, "response": response_list}


500000
10712
60088
60017


In [4]:
for k, v in dict_instruct_response.items():
    num_instruct, num_response = len(v['instruct']), len(v['response'])
    print(f'{k}: {num_instruct} | {num_response}')

pretrain: 500000 | 500000
instruct_10K: 52805 | 52805
instruct_60K: 171673 | 171673
instruct_60K_intext: 173696 | 173696


In [5]:

def compute_verb_noun_paris(sequences, csv_file_name):
    # print(sequences[:3])
    sequences = set(sequences) # if you are interested in studying the instructions, please change the task key
    print(f'# unique sequences {len(sequences)}')

    raw_phrases = []
    for seq in tqdm.tqdm(sequences):
        try:
            verb, noun = find_root_verb_and_its_dobj_in_string(seq)
            raw_phrases.append({
                "verb": verb,
                "noun": noun,
                "seq": seq
            })
        except Exception as e:
            print(e)
            print(seq)
    print(f'# raw_phrases {len(raw_phrases)}')
    raw_phrases = pd.DataFrame(raw_phrases)
    raw_phrases.to_csv(csv_file_name)  

In [31]:
import random

num_select = 100000
for k, v in dict_instruct_response.items():
    num_instruct, num_response = len(v['instruct']), len(v['response'])
    print(f'{k}: {num_instruct} | {num_response}')
    
    output_csv_folder_path = os.path.join(folder_path, 'csv')

    if num_instruct > num_select:
        instruct = random.sample(v['instruct'], num_select)
        response = random.sample(v['response'], num_select)
    else:
        instruct = v['instruct']
        response = v['response']

    compute_verb_noun_paris(instruct, f'{output_csv_folder_path}/{k}_instruction_verb_noun.csv') 
    compute_verb_noun_paris(response, f'{output_csv_folder_path}/{k}_response_verb_noun.csv') 


pretrain: 500000 | 500000
# unique sequences 27


100%|██████████| 27/27 [00:00<00:00, 240.38it/s]


# raw_phrases 27
# unique sequences 99748


  0%|          | 0/99748 [00:00<?, ?it/s]

list index out of range



100%|██████████| 99748/99748 [18:59<00:00, 87.52it/s] 


# raw_phrases 99747
instruct_10K: 52805 | 52805
# unique sequences 19117


100%|██████████| 19117/19117 [01:21<00:00, 234.93it/s]


# raw_phrases 19117
# unique sequences 47006


100%|██████████| 47006/47006 [05:27<00:00, 143.41it/s]


# raw_phrases 47006
instruct_60K: 171673 | 171673
# unique sequences 48401


100%|██████████| 48401/48401 [03:06<00:00, 258.84it/s]


# raw_phrases 48401
# unique sequences 94892


100%|██████████| 94892/94892 [12:17<00:00, 128.64it/s]


# raw_phrases 94892
instruct_60K_intext: 173696 | 173696
# unique sequences 51578


100%|██████████| 51578/51578 [03:23<00:00, 253.53it/s]


# raw_phrases 51578
# unique sequences 95344


100%|██████████| 95344/95344 [11:49<00:00, 134.43it/s]


# raw_phrases 95344


In [32]:
import plotly.graph_objects as go
import plotly.express as px

folder_path = "../../data/"

def visualize_verb_noun_paris(file_name, count_thredhold=10):
    raw_phrases = pd.read_csv(f'{file_name}.csv')
    raw_phrases = pd.DataFrame(raw_phrases)
    phrases = pd.DataFrame(raw_phrases).dropna()
    count_list = phrases[["verb", "noun"]].groupby(["verb", "noun"]).size().sort_values(ascending=False)
    print(len(count_list))

    top_verbs = phrases[["verb"]].groupby(["verb"]).size().nlargest(20).reset_index()

    df = phrases[phrases["verb"].isin(top_verbs["verb"].tolist())]
    # df = df[~df["noun"].isin(["I", "what"])]
    # df = phrases
    # df[~df["verb"].isin(top_verbs["verb"].tolist())]["verb"] = "other"
    # df[~df["verb"].isin(top_verbs["verb"].tolist())]["noun"] = "other"
    df = df.groupby(["verb", "noun"]).size().reset_index().rename(columns={0: "count"}).sort_values(by=["count"], ascending=False)
    # df = df[df["count"] > 10]
    df = df.groupby("verb").apply(lambda x: x.sort_values("count", ascending=False).head(4)).reset_index(drop=True)
    print(df)

    # df["blank"] = "ROOT"
    # df = phrases.groupby(["verb", "noun"]).size().sort_values(ascending=False).head(5).reset_index().rename(columns={0: "count"})

    df = df[df["count"] > count_thredhold]
    fig = px.sunburst(df, path=['verb', 'noun'], values='count')
    # fig.update_layout(uniformtext=dict(minsize=10, mode='hide'))
    fig.update_layout(
        margin=dict(l=0, r=0, t=0, b=0),
        font_family="Times New Roman",
    )
    # fig.show()
    output_html_folder_path = os.path.join(folder_path, 'output')
    file_name = file_name.split("/")[-1].split(".")[0]
    html_file_path = f"{output_html_folder_path}/{file_name}_{count_thredhold}.html"
    print(html_file_path)
    fig.write_html(html_file_path)
    # fig.savefig("output/verb_noun.pdf")


In [38]:
c_list = [30] # [0,10,20]
for k, v in dict_instruct_response.items():
    output_csv_folder_path = os.path.join(folder_path, 'csv')
    for c in c_list:
        visualize_verb_noun_paris(f'{output_csv_folder_path}/{k}_instruction_verb_noun', c)   
        visualize_verb_noun_paris(f'{output_csv_folder_path}/{k}_response_verb_noun', c)  


22
            verb            noun  count
0        analyze           image      1
1          break         element      1
2   characterize           image      1
3        clarify         content      1
4         create       narrative      1
5       describe           image      2
6        examine           image      1
7        explain          aspect      1
8           give     explanation      2
9     illustrate           image      1
10         offer     explanation      1
11         offer        analysis      1
12       portray           image      1
13       present     description      1
14       provide     description      2
15         relay         account      1
16        render         summary      1
17         share         rundown      1
18         share  interpretation      1
19     summarize         content      1
20         write       depiction      1
21         write         summary      1
../../data/output/pretrain_instruction_verb_noun_30.html
8626
        verb   

Bad pipe message: %s [b"K\xc6\x1b\xda\xfd\x1a\x16[s7\xbfz%\x11:\x0f\xc2%\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005\x00/"]
Bad pipe message: %s [b'\x95\x8a`\xb1\xd9]\x93\xcd\xbc9aa\x03K\xf4\xf7\xa5\xc7']
Bad pipe message: %s [b'\xa4f\xeb\x91\x87\x02\x17$\x85q^\x82\x86\x94\x9c\xa7tq\x00\x00\xa2\xc0\x14\xc0\n\x009\x008\x007\x006\x00\x88\x00\x87\x00\x86\x00\x85\xc0\x19\x00:\x00\x89\xc0\x0f\xc0\x05\x005\x00\x84\xc0\x13\xc0\t\x003\x002\x001\x000\x00\x9a\x00\x99\x00\x98\x00\x97\x00E\x00D\x00C\x00B\xc0\x18\x004\x00\x9b\x00F\xc0\x0e\xc0\x04\x00/\x00\x96\x00A\x00\x07\xc0\x11\xc0\x07\xc0\x16\x00\x18\xc0\x0c\xc0\x02\x00\x05\x00\x04\xc0\x12\xc0\x08\x00\x16\x00', b'\x10\x00\r\xc0\x17\x00\x1b\xc0\