{
"cells": [
{
"cell_type": "code",
"id": "afdeba94",
"metadata": {},
"source": [
"import os\n",
"import sys\n",
"\n",
"src_path = os.path.abspath(\"../..\")\n",
"print(src_path)\n",
"sys.path.append(src_path)"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "0d5f2e19",
"metadata": {},
"source": "from src.utils import processed_data_path, set_seed, remote_project_path",
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "e00815d2",
"metadata": {},
"source": [
"set_seed(seed=42)"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "fd92d900",
"metadata": {},
"source": [
"import pandas as pd"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "ef32981d",
"metadata": {},
"source": "model_path = os.path.join(remote_project_path, \"output\")",
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "42a6306c",
"metadata": {},
"source": [
"import math\n",
"\n",
"\n",
"def ci95(s):\n",
" return 1.96 * s.std() / math.sqrt(len(s))"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"def print_score(answer_filename, ci=True):\n",
" results = pd.read_json(os.path.join(model_path, f\"{answer_filename}/qa_output/answer_eval.jsonl\"), lines=True)\n",
" print(\"Size:\", len(results))\n",
" results[\"rel_score\"] = (results.score / results.base_score) * 100\n",
"\n",
" if not ci:\n",
" print(\"event:\")\n",
" df = results[results.source == \"event\"]\n",
" print(f\"GPT4 score: {df.base_score.mean():.4f} +- {df.base_score.std():.4f}\")\n",
" print(f\"LLM score: {df.score.mean():.4f} +- {df.score.std():.4f}\")\n",
" print(f\"Relative: {df.rel_score.mean():.4f} +- {df.rel_score.std():.4f}\")\n",
"\n",
" print(\"note:\")\n",
" df = results[results.source == \"note\"]\n",
" print(f\"GPT4 score: {df.base_score.mean():.4f} +- {df.base_score.std():.4f}\")\n",
" print(f\"LLM score: {df.score.mean():.4f} +- {df.score.std():.4f}\")\n",
" print(f\"Relative: {df.rel_score.mean():.4f} +- {df.rel_score.std():.4f}\")\n",
"\n",
" print(\"overall:\")\n",
" print(f\"GPT4 score: {results.base_score.mean():.4f} +- {results.base_score.std():.4f}\")\n",
" print(f\"LLM score: {results.score.mean():.4f} +- {results.score.std():.4f}\")\n",
" print(f\"Relative: {results.rel_score.mean():.4f} +- {results.rel_score.std():.4f}\")\n",
"\n",
" else:\n",
" print(\"event:\")\n",
" df = results[results.source == \"event\"]\n",
" print(f\"GPT4 score: {df.base_score.mean():.4f} +- {ci95(df.base_score):.4f}\")\n",
" print(f\"LLM score: {df.score.mean():.4f} +- {ci95(df.score):.4f}\")\n",
" print(f\"Relative: {df.rel_score.mean():.4f} +- {ci95(df.rel_score):.4f}\")\n",
"\n",
" print(\"note:\")\n",
" df = results[results.source == \"note\"]\n",
" print(f\"GPT4 score: {df.base_score.mean():.4f} +- {ci95(df.base_score):.4f}\")\n",
" print(f\"LLM score: {df.score.mean():.4f} +- {ci95(df.score):.4f}\")\n",
" print(f\"Relative: {df.rel_score.mean():.4f} +- {ci95(df.rel_score):.4f}\")\n",
"\n",
" print(\"overall:\")\n",
" print(f\"GPT4 score: {results.base_score.mean():.4f} +- {ci95(results.base_score):.4f}\")\n",
" print(f\"LLM score: {results.score.mean():.4f} +- {ci95(results.score):.4f}\")\n",
" print(f\"Relative: {results.rel_score.mean():.4f} +- {ci95(results.rel_score):.4f}\")\n",
"\n",
" return"
],
"id": "fbcddd14",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": "print_score(\"llemr_vicuna\")",
"id": "9df914e9",
"outputs": [],
"execution_count": null
}
],
"metadata": {
"kernelspec": {
"display_name": "llm",
"language": "python",
"name": "llm"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
}
},
"nbformat": 4,
"nbformat_minor": 5
}