--- a +++ b/src/eval/summary_eval.ipynb @@ -0,0 +1,157 @@ +{ + "cells": [ + { + "cell_type": "code", + "id": "afdeba94", + "metadata": {}, + "source": [ + "import os\n", + "import sys\n", + "\n", + "src_path = os.path.abspath(\"../..\")\n", + "print(src_path)\n", + "sys.path.append(src_path)" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "0d5f2e19", + "metadata": {}, + "source": "from src.utils import processed_data_path, set_seed, remote_project_path", + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "e00815d2", + "metadata": {}, + "source": [ + "set_seed(seed=42)" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "fd92d900", + "metadata": {}, + "source": [ + "import pandas as pd" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "ef32981d", + "metadata": {}, + "source": "model_path = os.path.join(remote_project_path, \"output\")", + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "42a6306c", + "metadata": {}, + "source": [ + "import math\n", + "\n", + "\n", + "def ci95(s):\n", + " return 1.96 * s.std() / math.sqrt(len(s))" + ], + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "def print_score(answer_filename, ci=True):\n", + " results = pd.read_json(os.path.join(model_path, f\"{answer_filename}/qa_output/answer_eval.jsonl\"), lines=True)\n", + " print(\"Size:\", len(results))\n", + " results[\"rel_score\"] = (results.score / results.base_score) * 100\n", + "\n", + " if not ci:\n", + " print(\"event:\")\n", + " df = results[results.source == \"event\"]\n", + " print(f\"GPT4 score: {df.base_score.mean():.4f} +- {df.base_score.std():.4f}\")\n", + " print(f\"LLM score: {df.score.mean():.4f} +- {df.score.std():.4f}\")\n", + " print(f\"Relative: {df.rel_score.mean():.4f} +- {df.rel_score.std():.4f}\")\n", + "\n", + " print(\"note:\")\n", + " df = results[results.source == \"note\"]\n", + " print(f\"GPT4 score: {df.base_score.mean():.4f} +- {df.base_score.std():.4f}\")\n", + " print(f\"LLM score: {df.score.mean():.4f} +- {df.score.std():.4f}\")\n", + " print(f\"Relative: {df.rel_score.mean():.4f} +- {df.rel_score.std():.4f}\")\n", + "\n", + " print(\"overall:\")\n", + " print(f\"GPT4 score: {results.base_score.mean():.4f} +- {results.base_score.std():.4f}\")\n", + " print(f\"LLM score: {results.score.mean():.4f} +- {results.score.std():.4f}\")\n", + " print(f\"Relative: {results.rel_score.mean():.4f} +- {results.rel_score.std():.4f}\")\n", + "\n", + " else:\n", + " print(\"event:\")\n", + " df = results[results.source == \"event\"]\n", + " print(f\"GPT4 score: {df.base_score.mean():.4f} +- {ci95(df.base_score):.4f}\")\n", + " print(f\"LLM score: {df.score.mean():.4f} +- {ci95(df.score):.4f}\")\n", + " print(f\"Relative: {df.rel_score.mean():.4f} +- {ci95(df.rel_score):.4f}\")\n", + "\n", + " print(\"note:\")\n", + " df = results[results.source == \"note\"]\n", + " print(f\"GPT4 score: {df.base_score.mean():.4f} +- {ci95(df.base_score):.4f}\")\n", + " print(f\"LLM score: {df.score.mean():.4f} +- {ci95(df.score):.4f}\")\n", + " print(f\"Relative: {df.rel_score.mean():.4f} +- {ci95(df.rel_score):.4f}\")\n", + "\n", + " print(\"overall:\")\n", + " print(f\"GPT4 score: {results.base_score.mean():.4f} +- {ci95(results.base_score):.4f}\")\n", + " print(f\"LLM score: {results.score.mean():.4f} +- {ci95(results.score):.4f}\")\n", + " print(f\"Relative: {results.rel_score.mean():.4f} +- {ci95(results.rel_score):.4f}\")\n", + "\n", + " return" + ], + "id": "fbcddd14", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": "print_score(\"llemr_vicuna\")", + "id": "9df914e9", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": "", + "id": "67bd046750285528" + } + ], + "metadata": { + "kernelspec": { + "display_name": "llm", + "language": "python", + "name": "llm" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}