--- a +++ b/Code/Drug Discovery/ProtGPT2/Protein Generator II, kkawchak.ipynb @@ -0,0 +1,308 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "machine_shape": "hm" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "-oTEyfkwNdmj" + }, + "outputs": [], + "source": [ + "# from transformers import pipeline, GPT2Tokenizer, GPT2LMHeadModel\n", + "# import time\n", + "# import torch\n", + "# import math" + ] + }, + { + "cell_type": "code", + "source": [ + "# Ref 1: Hugging Face, nferruz/ProtGPT2. https://huggingface.co/nferruz/ProtGPT2\n", + "# Ref 2: Hayes, J. Medium, 2023 https://medium.com/labs-notebook/large-language-models-for-drug-discovery-7ddfc005e0bb\n", + "# Ref 3: HF Intro, AssemblyAI 2022 https://www.youtube.com/watch?v=QEaBAZQCtwE&t=4s\n", + "# Ref 4: ChatGPT3.5 Code assist 2024 https://chat.openai.com/" + ], + "metadata": { + "id": "jzr3h73L1w-7" + }, + "execution_count": 2, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "seconds = time.time()\n", + "print(\"Time in seconds since beginning of run:\", seconds)\n", + "local_time = time.ctime(seconds)\n", + "print(local_time)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 0 + }, + "id": "2K0dcoVBuBGy", + "outputId": "d0ad8867-b992-415c-d6a9-a6ebf3994229" + }, + "execution_count": 21, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Time in seconds since beginning of run: 1712810958.7090964\n", + "Thu Apr 11 04:49:18 2024\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from transformers import pipeline\n", + "protgpt2 = pipeline('text-generation', model=\"nferruz/ProtGPT2\")\n", + "sequences = protgpt2(\"F\", max_length=300, do_sample=True, top_k=950, repetition_penalty=1.2, num_return_sequences=10, eos_token_id=0)\n", + "for seq in sequences:\n", + " print(seq)\n", + "# Hugging Face, https://huggingface.co/nferruz/ProtGPT2. https://huggingface.co/nferruz/ProtGPT2\n", + "# In the example below, ProtGPT2 generates sequences that follow the amino acid 'M'. Any other amino acid, oligomer, fragment, or protein of choice can be selected instead. The model will generate the most probable sequences that follow the input. Alternatively, the input field can also be left empty and it will choose the starting tokens." + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 0 + }, + "id": "rpEKO3y3On8l", + "outputId": "bdaf0fc7-7a7e-482f-d534-5a257aff6c7e" + }, + "execution_count": 22, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", + "Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "{'generated_text': 'F\\nAACDQQSGDINESLHDEPNFLSFRACRPNQSATQTHWKLNVSTKLTWAIINLGISLGTIL\\nVIIMWKPLFHKRFWSDYFVYLRDLHLRIQHPPAPIQLATALSKESWKDGKGRTQPLESTS\\nGDHQELRTSRGLQPQDEKVHEKSFCISNGEDGITGRFNTALSDGDTEKSSHLIDSVCNEE\\nAEEESDVPTCDLDHEQALIGTELDCRIHRLFRRRCYSPTVSSLERVGSQGSVV\\n'}\n", + "{'generated_text': 'F\\nKKTLDISRKTRDMYQYMNTYIVDDYRSVSLNEFRNICNKYLKTIEEYNVFYKNNPKSSNS\\nAKYSKLLTNIRSVETCVNVLLHT\\n'}\n", + "{'generated_text': 'FMGSFSPDLIYLKNYVPNN\\nKELKVHRDLVAWIKNEDIEKEFLKNTTLSYVKYIQKKIKNGLSEFDKIKVLGYTLGILFH\\nIILDRIAIPIIYNSKEKFGVEVFLDDFKYFLCNRYNFRPINNIDVVNVLRNNYKKSNDYK\\nTKLFVERYLTYYFNQKKLEYNKAKSLFKSNYTLQLLGSFFGYDIIKKLNKKYESTDFYNL\\nGKELVDKYFENKINKINIEECFNNVCTNLIYIKEKVLYDYINNYTSLLNILKNEDLKNEL\\nLNKYQDIYEKEKLKFYEKNYWRFRIKNKEYIIKENEYLVNLIINNTDYRNYKKYKSTYCS\\nKCNVKLKPKRNHTQYCSVICKKESKRKRSQ\\n'}\n", + "{'generated_text': 'F\\nPSGVRRTAVSRLLELPGEDPITTIVACGGRIEYRRGTELPPGSLDLPHERHLPWIRAALA\\nTLGLTPDELEDVDDDLTRLGVDSVSTAAMADELSGLWGLDISASVVFDHSSLRQIAGSLV\\nAEAPLTEAGA\\n'}\n", + "{'generated_text': 'FAFLGLLLYGVSLYSWPMGWNSGYYRHYRGCWVRCRHPHRYWR\\n'}\n", + "{'generated_text': 'F\\n'}\n", + "{'generated_text': 'F\\nARWIAEGAPARGPPGVGIAPTGALLVRSRTAWTPPALVAALRALLGRFGAVAVAEPGLTL\\nLPLDAEGRVSRARWDVAKLDAARSRLGRPYAPWLALPDLGVAATAWAAAAGGAPRLVVVD\\nAPPAVAPHADDPIRTRGDRVLASRVLLDEIARAPTGARRVVRALPRGAVAPDAVLADARA\\nRGVRLHALLGAPGPRRLLAWSASEAAPDAIEALARLRAALAAAGIRFSVRGGRTPAAAVV\\nAAWREAEAPAGSAAPATPSVPPPPEAPPEPVRPARRRPSRSSTAPRAPGLGGAAPEAGRR\\nAPRPAPRRPRAAAPPPPPPPPSPPPDRATPPPHDAPAPTAAPPRPTRLAARARPAGDAPQ\\nPPRSTDDDDAPPPPPDDEAPSPPRAHARLDPMDALAAALRAVGVRASDASARLPDGTVVC\\nSATGGPDAGADADLAALEARLDALEAGLERATAAAASAALQPMRARLAAFAARLPRPRED\\nDGAEPAEEATPEHGRPPATP\\n'}\n", + "{'generated_text': 'FVQDKVELSENWSVVAGLRYDKFDQNIIDNSNLDFNALSPRIGITWN\\nPLNDTTIKFMYGGAFRPPPALEQSFTDPDAGTSFFGNPDLDAEEVRSYELSIEQQIGDWA\\nLGVATFRNDIDGAIYRRESGTGDSSFINFDSASIKGAELIAQKYVGEWWNLVANYSLQEA\\nEDLSGNELSNTPEHTVSVSLGTNLADWGVETTLRYVGRQRDSEQGSYIGIPSDTLVNLFF\\nSAKVDLPRNVDMFIGVNNIFDEEYRTPGADDAYLEEPREVRLGLNVRW\\n'}\n", + "{'generated_text': 'F\\nDMFKQDIILSCNDCNLLKTSKLLNINRNTLSQRINHWNIELDIRNVLNKGI\\n'}\n", + "{'generated_text': 'FLLLSVGYAQQEVVL\\nPVITNENNVVLTSDELKKIEDKNNVYYSNAVSETNVAEAFIPLANNYDQYFLTSEKKPKF\\nGIVRYQKGDINYYLTISNKYKDSIVRYFEYNILSQKITAIYNLKYPAEISLNFNLDYPKA\\nLIVSYNENKVYLSDYYAFEIDKDLNLEIKKQIPIYSNILDSEKIIGKSSYDTFSIKTNST\\nFYKVNILNKEKKVTDIEVPTAISSNQSHYNSFPFFNYISLESNKLVLFYFSYFDEDGNTW\\nYGPDKVKTYVYDSDFNIQWMEEIRIPSSNKWFSEPVDIVKTKNDTYLVLTHYYSLFKGNW\\nDKTPTNNELLLISFDKATGELKEESFASIPEKELLSAIPVVGQSLYKDQNTVSFQRDKIV\\nSWGVKNKTRQ\\n'}\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "sequence1='FYLSITIHRPLRPSSSSFLSLCLSLLSISIYYPS\\nLLIRRFTSISSCSSITIYHPLLYPSPSSLFLSLSHTYIYISPLHPSSLLLSISLLFYLSI\\nYIIYPLQPSSLLLSISLPLSISIYLSYPPLSSPSPSLSLYLTPFLLIPSLSIYLSLPFPY\\nHSYLYLRLLFHPPLPLHICHLPHSLTLFIFLLPPHLSHLPILFSRLQPFYPSTSPSSYRP\\nLPCIPSASYFSYHPLSPPPSLHPHPLSYPSVSRPSPPYLSIHLHSPPPPPPPSPFSSIHP\\nPFLSSTLPLPSSTSSLPPSSSPFSSTHLIPSPSSPPPPSLLPSSLPL'\n", + "\n", + "# Convert the sequence to a string like this\n", + "# (note we have to introduce new line characters every 60 amino acids,\n", + "# following the FASTA file format).\n", + "# Sequence 1\n", + "\n", + "sequence1 = \"<|endoftext|>FYLSITIHRPLRPSSSSFLSLCLSLLSISIYYPS\\nLLIRRFTSISSCSSITIYHPLLYPSPSSLFLSLSHTYIYISPLHPSSLLLSISLLFYLSI\\nYIIYPLQPSSLLLSISLPLSISIYLSYPPLSSPSPSLSLYLTPFLLIPSLSIYLSLPFPY\\nHSYLYLRLLFHPPLPLHICHLPHSLTLFIFLLPPHLSHLPILFSRLQPFYPSTSPSSYRP\\nLPCIPSASYFSYHPLSPPPSLHPHPLSYPSVSRPSPPYLSIHLHSPPPPPPPSPFSSIHP\\nPFLSSTLPLPSSTSSLPPSSSPFSSTHLIPSPSSPPPPSLLPSSLPL<|endoftext|>\"\n", + "\n", + "protgpt2 = pipeline('text-generation', model=\"nferruz/ProtGPT2\")\n", + "tokenizer = GPT2Tokenizer.from_pretrained(\"nferruz/ProtGPT2\")\n", + "model = GPT2LMHeadModel.from_pretrained(\"nferruz/ProtGPT2\")\n", + "\n", + "# ppl function\n", + "def calculatePerplexity(sequence, model, tokenizer):\n", + " input_ids = torch.tensor(tokenizer.encode(sequence)).unsqueeze(0)\n", + " input_ids = input_ids.to()\n", + " with torch.no_grad():\n", + " outputs = model(input_ids, labels=input_ids)\n", + " loss, logits = outputs[:2]\n", + " return math.exp(loss)\n", + "\n", + "#And hence:\n", + "ppl1 = calculatePerplexity(sequence1, model, tokenizer)\n", + "print(ppl1)\n", + "\n", + "# We do not yet have a threshold as to what perplexity value gives a 'good' or 'bad' sequence, but given the fast inference times, the best is to sample many sequences, order them by perplexity, and select those with the lower values (the lower the better)." + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 0 + }, + "id": "Lc1fnS5DSOxh", + "outputId": "0c1ffe93-b6df-48d9-9620-58adac99c78c" + }, + "execution_count": 23, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "460.08118790732226\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "sequence3='FVFDPDTPDQVAVAVWRWAEAQCAG\\nIVYEDVSRELYERAVDRAFADLPDGLEHTWDCECGYCGDLPAEAIEAERAVEYAEQERAT\\nPAGPVAASWWAAGWRRIRGWFEN'\n", + "\n", + "# Convert the sequence to a string like this\n", + "# (note we have to introduce new line characters every 60 amino acids,\n", + "# following the FASTA file format).\n", + "# Sequence 3\n", + "\n", + "sequence3 = \"<|endoftext|>FVFDPDTPDQVAVAVWRWAEAQCAG\\nIVYEDVSRELYERAVDRAFADLPDGLEHTWDCECGYCGDLPAEAIEAERAVEYAEQERAT\\nPAGPVAASWWAAGWRRIRGWFEN<|endoftext|>\"\n", + "\n", + "protgpt2 = pipeline('text-generation', model=\"nferruz/ProtGPT2\")\n", + "tokenizer = GPT2Tokenizer.from_pretrained(\"nferruz/ProtGPT2\")\n", + "model = GPT2LMHeadModel.from_pretrained(\"nferruz/ProtGPT2\")\n", + "\n", + "# ppl function\n", + "def calculatePerplexity(sequence, model, tokenizer):\n", + " input_ids = torch.tensor(tokenizer.encode(sequence)).unsqueeze(0)\n", + " input_ids = input_ids.to()\n", + " with torch.no_grad():\n", + " outputs = model(input_ids, labels=input_ids)\n", + " loss, logits = outputs[:2]\n", + " return math.exp(loss)\n", + "\n", + "#And hence:\n", + "ppl3 = calculatePerplexity(sequence3, model, tokenizer)\n", + "print(ppl3)\n", + "\n", + "# ProtGPT2: We do not yet have a threshold as to what perplexity value gives a 'good' or 'bad' sequence, but given the fast inference times, the best is to sample many sequences, order them by perplexity, and select those with the lower values (the lower the better)." + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 0 + }, + "id": "oF-Lx9LkQmjF", + "outputId": "cefd6f8b-dedd-4abd-ef27-33e892a6745b" + }, + "execution_count": 24, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "2380.9905856892447\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Perplexity Sequence 1 - Sequence 3\n", + "# If positive, use Sequence 3. If negative use Sequence 1.\n", + "Perplexity_Difference = (ppl1-ppl3)\n", + "print(Perplexity_Difference)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 0 + }, + "id": "Lg346zq-U2ps", + "outputId": "9ba056ad-64b8-496f-f3bb-8764d3ef9dda" + }, + "execution_count": 25, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "-1920.9093977819225\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "def calculate_sequence_identity(seq1, seq3):\n", + " matches = sum(a == b for a, b in zip(seq1, seq3))\n", + " identity = matches / len(seq1)\n", + " return identity\n", + "\n", + "# A Sequence Identity of 1 means amino acids match at every position when aligned\n", + "identity = calculate_sequence_identity(sequence1, sequence3)\n", + "print(\"Sequence Identity:\", identity)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 0 + }, + "id": "YP8UG_hHYqaH", + "outputId": "a0afc4f5-5f77-487f-eb1c-81c8de596d09" + }, + "execution_count": 26, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Sequence Identity: 0.05113636363636364\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "seconds = time.time()\n", + "print(\"Time in seconds since end of run:\", seconds)\n", + "local_time = time.ctime(seconds)\n", + "print(local_time)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 0 + }, + "id": "TkiND_HHuERP", + "outputId": "ca4ba513-dd58-4d16-99f4-3b91f64eadd1" + }, + "execution_count": 27, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Time in seconds since end of run: 1712811040.5421786\n", + "Thu Apr 11 04:50:40 2024\n" + ] + } + ] + } + ] +} \ No newline at end of file