[780764]: / src / eval / summary_eval.ipynb

Download this file

158 lines (157 with data), 4.7 kB

{
 "cells": [
  {
   "cell_type": "code",
   "id": "afdeba94",
   "metadata": {},
   "source": [
    "import os\n",
    "import sys\n",
    "\n",
    "# Directory two levels above the kernel's working directory, made importable\n",
    "# so that the `from src...` imports below resolve.\n",
    "# NOTE(review): this is relative to the current working directory, so it\n",
    "# presumably expects the kernel to be started from this notebook's folder - confirm.\n",
    "src_path = os.path.abspath(\"../..\")\n",
    "print(src_path)\n",
    "# Guard the append so re-running this cell does not pile up duplicate entries.\n",
    "if src_path not in sys.path:\n",
    "    sys.path.append(src_path)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "0d5f2e19",
   "metadata": {},
   "source": "from src.utils import processed_data_path, set_seed, remote_project_path",
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "e00815d2",
   "metadata": {},
   "source": [
    "set_seed(seed=42)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "fd92d900",
   "metadata": {},
   "source": [
    "import pandas as pd"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "ef32981d",
   "metadata": {},
   "source": "model_path = os.path.join(remote_project_path, \"output\")",
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "42a6306c",
   "metadata": {},
   "source": [
    "import math\n",
    "\n",
    "\n",
    "def ci95(s):\n",
    "    \"\"\"Return the half-width of a normal-approximation 95% confidence interval.\n",
    "\n",
    "    Computed as ``1.96 * std / sqrt(n)`` over the non-missing values of ``s``.\n",
    "\n",
    "    Args:\n",
    "        s: pandas Series of numeric observations; NaN entries are ignored.\n",
    "\n",
    "    Returns:\n",
    "        The half-width as a float, or NaN when ``s`` has no non-missing values.\n",
    "    \"\"\"\n",
    "    # Series.std() skips NaN, so the sample size must skip NaN as well;\n",
    "    # len(s) would count missing rows and make the interval too narrow.\n",
    "    n = s.count()\n",
    "    if n == 0:\n",
    "        # Nothing to estimate from; avoid dividing by sqrt(0).\n",
    "        return math.nan\n",
    "    return 1.96 * s.std() / math.sqrt(n)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "def _print_group(label, df, spread):\n",
    "    \"\"\"Print ``mean +- spread`` of the three score columns for one subset of rows.\n",
    "\n",
    "    Args:\n",
    "        label: Heading printed before the three score lines.\n",
    "        df: DataFrame with ``base_score``, ``score`` and ``rel_score`` columns.\n",
    "        spread: Callable mapping a Series to the number shown after ``+-``.\n",
    "    \"\"\"\n",
    "    print(f\"{label}:\")\n",
    "    for title, column in ((\"GPT4 score\", \"base_score\"), (\"LLM score\", \"score\"), (\"Relative\", \"rel_score\")):\n",
    "        values = df[column]\n",
    "        print(f\"{title}: {values.mean():.4f} +- {spread(values):.4f}\")\n",
    "\n",
    "\n",
    "def print_score(answer_filename, ci=True):\n",
    "    \"\"\"Print summary statistics of an answer-evaluation file.\n",
    "\n",
    "    Reads ``<model_path>/<answer_filename>/qa_output/answer_eval.jsonl`` as JSON\n",
    "    Lines, prints the row count, then prints the mean and spread of\n",
    "    ``base_score``, ``score`` and their ratio in percent for the rows whose\n",
    "    ``source`` is ``\"event\"``, for those whose ``source`` is ``\"note\"``, and\n",
    "    for all rows.\n",
    "\n",
    "    Args:\n",
    "        answer_filename: Name of the run directory under ``model_path``.\n",
    "        ci: If True the spread is the 95% CI half-width from ``ci95``;\n",
    "            otherwise it is the sample standard deviation.\n",
    "\n",
    "    Returns:\n",
    "        None. The results are only printed.\n",
    "    \"\"\"\n",
    "    results = pd.read_json(os.path.join(model_path, f\"{answer_filename}/qa_output/answer_eval.jsonl\"), lines=True)\n",
    "    print(\"Size:\", len(results))\n",
    "    # Per-row score as a percentage of the baseline score. A base_score of 0\n",
    "    # produces inf/NaN here, which then propagates into the printed means.\n",
    "    results[\"rel_score\"] = (results.score / results.base_score) * 100\n",
    "\n",
    "    # The two reporting modes differ only in the statistic shown after \"+-\".\n",
    "    spread = ci95 if ci else pd.Series.std\n",
    "\n",
    "    for source in (\"event\", \"note\"):\n",
    "        _print_group(source, results[results.source == source], spread)\n",
    "    _print_group(\"overall\", results, spread)"
   ],
   "id": "fbcddd14",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": "print_score(\"llemr_vicuna\")",
   "id": "9df914e9",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "",
   "id": "67bd046750285528"
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llm",
   "language": "python",
   "name": "llm"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}