--- a
+++ b/aiagents4pharma/talk2knowledgegraphs/datasets/primekg.py
@@ -0,0 +1,201 @@
+"""
+Class for loading PrimeKG dataset.
+"""
+
+import os
+from typing import Optional
+import requests
+from tqdm import tqdm
+import pandas as pd
+from .dataset import Dataset
+
+class PrimeKG(Dataset):
+    """
+    Class for loading PrimeKG dataset.
+    It downloads the data from the Harvard Dataverse and stores it in the local directory.
+    The data is then loaded into pandas DataFrames of nodes and edges.
+    """
+
+    def __init__(self, local_dir: str = "../../../data/primekg/"):
+        """
+        Constructor for PrimeKG class.
+
+        Args:
+            local_dir (str): The local directory where the data will be stored.
+        """
+        self.name: str = "primekg"
+        self.server_path: str = "https://dataverse.harvard.edu/api/access/datafile/"
+        self.file_ids: dict = {"nodes": 6180617, "edges": 6180616}
+        self.local_dir: str = local_dir
+
+        # Attributes to store the data
+        self.nodes: Optional[pd.DataFrame] = None
+        self.edges: Optional[pd.DataFrame] = None
+
+        # Set up the dataset
+        self.setup()
+
+    def setup(self):
+        """
+        A method to set up the dataset by creating the local directory if it does not exist.
+        """
+        # Make the local directory if it doesn't exist
+        os.makedirs(self.local_dir, exist_ok=True)
+
+    def _download_file(self, remote_url: str, local_path: str):
+        """
+        A helper function to download a file from a remote URL to a local path.
+
+        Args:
+            remote_url (str): The remote URL of the file to be downloaded.
+            local_path (str): The local path where the file will be saved.
+        """
+        response = requests.get(remote_url, stream=True, timeout=300)
+        response.raise_for_status()
+        progress_bar = tqdm(
+            total=int(response.headers.get("content-length", 0)),
+            unit="iB",
+            unit_scale=True,
+        )
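+        # Write the response to the local file in 1 KiB chunks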
+        with open(local_path, "wb") as file:
+            for data in response.iter_content(1024):
+                progress_bar.update(len(data))
+                file.write(data)
+        progress_bar.close()
+
+    def _load_nodes(self) -> pd.DataFrame:
+        """
+        Private method to load the nodes dataframe of PrimeKG dataset.
+        This method downloads the nodes file from the Harvard Dataverse if it does not exist
+        in the local directory. Otherwise, it loads the data from the local directory.
+        It further processes the dataframe of nodes and returns it.
+
+        Returns:
+            The nodes dataframe of PrimeKG dataset.
+        """
+        local_file = os.path.join(self.local_dir, f"{self.name}_nodes.tsv.gz")
+        if os.path.exists(local_file):
+            print(f"{local_file} already exists. Loading the data from the local directory.")
+
+            # Load the dataframe from the local directory and assign it to the nodes attribute
+            nodes = pd.read_csv(local_file, sep="\t", compression="gzip", low_memory=False)
+        else:
+            print(f"Downloading node file from {self.server_path}{self.file_ids['nodes']}")
+
+            # Download the file from the Harvard Dataverse with designated file_id for node
+            self._download_file(f"{self.server_path}{self.file_ids['nodes']}",
+                                os.path.join(self.local_dir, "nodes.tab"))
+
+            # Load the downloaded file into a pandas DataFrame
+            nodes = pd.read_csv(os.path.join(self.local_dir, "nodes.tab"),
+                                sep="\t", low_memory=False)
+
+            # Further processing of the dataframe
+            nodes = nodes[
+                ["node_index", "node_name", "node_source", "node_id", "node_type"]
+            ]
+
+            # Store compressed dataframe in the local directory
+            nodes.to_csv(local_file, index=False, sep="\t", compression="gzip")
+
+        return nodes
+
+    def _load_edges(self, nodes: pd.DataFrame) -> pd.DataFrame:
+        """
+        Private method to load the edges dataframe of PrimeKG dataset.
+        This method downloads the edges file from the Harvard Dataverse if it does not exist
+        in the local directory. Otherwise, it loads the data from the local directory.
+        It further processes the dataframe of edges and returns it.
+
+        Args:
+            nodes (pd.DataFrame): The nodes dataframe of PrimeKG dataset.
+
+        Returns:
+            The edges dataframe of PrimeKG dataset.
+        """
+        local_file = os.path.join(self.local_dir, f"{self.name}_edges.tsv.gz")
+        if os.path.exists(local_file):
+            print(f"{local_file} already exists. Loading the data from the local directory.")
+
+            # Load the dataframe from the local directory and assign it to the edges attribute
+            edges = pd.read_csv(local_file, sep="\t", compression="gzip", low_memory=False)
+        else:
+            print(f"Downloading edge file from {self.server_path}{self.file_ids['edges']}")
+
+            # Download the file from the Harvard Dataverse with designated file_id for edge
+            self._download_file(f"{self.server_path}{self.file_ids['edges']}",
+                                os.path.join(self.local_dir, "edges.csv"))
+
+            # Load the downloaded file into a pandas DataFrame
+            edges = pd.read_csv(os.path.join(self.local_dir, "edges.csv"),
+                                sep=",", low_memory=False)
+
+            # Further processing of the dataframe
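+            # Attach head node attributes by joining x_index to node_index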
+            edges = edges.merge(
+                nodes, left_on="x_index", right_on="node_index"
+            )
+            edges.drop(["x_index"], axis=1, inplace=True)
+            edges.rename(
+                columns={
+                    "node_index": "head_index",
+                    "node_name": "head_name",
+                    "node_source": "head_source",
+                    "node_id": "head_id",
+                    "node_type": "head_type",
+                },
+                inplace=True,
+            )
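+            # Attach tail node attributes by joining y_index to node_index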
+            edges = edges.merge(
+                nodes, left_on="y_index", right_on="node_index"
+            )
+            edges.drop(["y_index"], axis=1, inplace=True)
+            edges.rename(
+                columns={
+                    "node_index": "tail_index",
+                    "node_name": "tail_name",
+                    "node_source": "tail_source",
+                    "node_id": "tail_id",
+                    "node_type": "tail_type"
+                },
+                inplace=True,
+            )
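+            # Keep only the head/tail attributes and the relation columns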
+            edges = edges[
+                [
+                    "head_index", "head_name", "head_source", "head_id", "head_type",
+                    "tail_index", "tail_name", "tail_source", "tail_id", "tail_type",
+                    "display_relation", "relation",
+                ]
+            ]
+
+            # Store compressed dataframe in the local directory
+            edges.to_csv(local_file, index=False, sep="\t", compression="gzip")
+
+        return edges
+
+    def load_data(self):
+        """
+        Load the PrimeKG dataset into pandas DataFrames of nodes and edges.
+        """
+        print("Loading nodes of PrimeKG dataset ...")
+        self.nodes = self._load_nodes()
+
+        print("Loading edges of PrimeKG dataset ...")
+        self.edges = self._load_edges(self.nodes)
+
+    def get_nodes(self) -> pd.DataFrame:
+        """
+        Get the nodes dataframe of PrimeKG dataset.
+
+        Returns:
+            The nodes dataframe of PrimeKG dataset.
+        """
+        return self.nodes
+
+    def get_edges(self) -> pd.DataFrame:
+        """
+        Get the edges dataframe of PrimeKG dataset.
+
+        Returns:
+            The edges dataframe of PrimeKG dataset.
+        """
+        return self.edges
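+
+
+# A minimal usage sketch (illustrative only, not part of the class API). It assumes the
+# default relative local_dir resolves to a writable location and that the Harvard
+# Dataverse file IDs above are reachable; the first call downloads the data.
+# Because of the relative import above, run it as a module, e.g.
+# python -m aiagents4pharma.talk2knowledgegraphs.datasets.primekg
+if __name__ == "__main__":
+    primekg = PrimeKG(local_dir="../../../data/primekg/")
+    primekg.load_data()
+    print(primekg.get_nodes().head())
+    print(primekg.get_edges().head())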