In [6]:
# !pip install bio
# from transformers import pipeline
# from Bio import pairwise2
# from Bio.Seq import Seq
# from Bio.SeqUtils import ProtParam
# import time

In [7]:
# Ref 1: Hayes, J. Medium, 2023 https://medium.com/labs-notebook/large-language-models-for-drug-discovery-7ddfc005e0bb
# Ref 2: HF Intro, AssemblyAI 2022 https://www.youtube.com/watch?v=QEaBAZQCtwE&t=4s
# Ref 3: ChatGPT3.5 Code assist 2024 https://chat.openai.com/

In [8]:
# Stamp the start of the run. time.time() is an absolute wall-clock reading
# (seconds since the epoch), so the number printed below is a timestamp and
# not an elapsed duration, despite the wording of the label.
# NOTE(review): this cell needs `time` in scope, but `import time` is
# commented out in the import cell above.
seconds = time.time()
local_time = time.ctime(seconds)  # same instant, human-readable
print("Time in seconds since beginning of run:", seconds)
print(local_time)

Time in seconds since beginning of run: 1712600025.153674
Mon Apr  8 18:13:45 2024


In [9]:
# Function to calculate percent similarity
def calculate_similarity(reference_sequence, generated_sequence):
    """Return the global-alignment match score as a percentage of the reference length.

    The two sequences are globally aligned with ``pairwise2.align.globalxx``
    and the best score is divided by ``len(reference_sequence)``.

    Args:
        reference_sequence: Sequence the score is normalised against.
        generated_sequence: Sequence being compared to the reference.

    Returns:
        ``best_score / len(reference_sequence) * 100`` as a float, or ``0.0``
        when either sequence is empty.
    """
    # An empty input has nothing to align, and an empty reference would also
    # make the normalisation divide by zero; report "no similarity" instead.
    if not reference_sequence or not generated_sequence:
        return 0.0

    # Only the best score is needed, so ask for the score alone rather than
    # building the full list of optimal alignments and reading element [2]
    # of the first one.
    best_score = pairwise2.align.globalxx(
        reference_sequence, generated_sequence, score_only=True
    )
    return (best_score / len(reference_sequence)) * 100

# Percent similarity between the molecular weights of two protein sequences
def calculate_molecular_weight_similarity(reference_sequence, generated_sequence):
    """Score how close the generated sequence's molecular weight is to the reference's.

    The score is ``(1 - |ref - gen| / ref) * 100``: 100 when the weights are
    equal, and below zero once the gap is larger than the reference weight
    itself. No clamping is done here.

    Args:
        reference_sequence: Protein sequence used as the baseline.
        generated_sequence: Protein sequence compared against the baseline.

    Returns:
        The score as a float, or ``0.0`` if the computation raises
        ``ZeroDivisionError``.

    Raises:
        ValueError: If ``generated_sequence`` contains the letter ``'X'``.
    """
    # Reject 'X' up front; this error is not one the handler below catches.
    if 'X' in generated_sequence:
        raise ValueError("'X' is not a valid unambiguous letter for protein")

    try:
        reference_weight = ProtParam.ProteinAnalysis(reference_sequence).molecular_weight()
        generated_weight = ProtParam.ProteinAnalysis(generated_sequence).molecular_weight()
        relative_gap = abs(reference_weight - generated_weight) / reference_weight
        return (1 - relative_gap) * 100
    except ZeroDivisionError:
        return 0.0

# Percent similarity between the isoelectric points of two protein sequences
def calculate_isoelectric_point_similarity(reference_sequence, generated_sequence):
    """Score how close the generated sequence's isoelectric point is to the reference's.

    The score is ``(1 - |ref - gen| / ref) * 100``: 100 when both isoelectric
    points are equal, and below zero once the gap is larger than the
    reference value itself. No clamping is done here.

    Args:
        reference_sequence: Protein sequence used as the baseline.
        generated_sequence: Protein sequence compared against the baseline.

    Returns:
        The score as a float, or ``0.0`` if the computation raises
        ``ZeroDivisionError``.

    Raises:
        ValueError: If ``generated_sequence`` contains the letter ``'X'``.
    """
    # Reject 'X' up front; this error is not one the handler below catches.
    if 'X' in generated_sequence:
        raise ValueError("'X' is not a valid unambiguous letter for protein")

    try:
        reference_point = ProtParam.ProteinAnalysis(reference_sequence).isoelectric_point()
        generated_point = ProtParam.ProteinAnalysis(generated_sequence).isoelectric_point()
        relative_gap = abs(reference_point - generated_point) / reference_point
        return (1 - relative_gap) * 100
    except ZeroDivisionError:
        return 0.0

# Baseline protein that every generated sequence is compared against
reference_sequence = "MSKGEEVQNLFASGFRLDSAKTAAVLYGLDAGNSLSGGQPLFFSGLKPRQGVRKIAKELKVRMMDPNFIRVSGRSGKQVTMNEEAVSEFYELQGKINTFIKIVNTKEITFHKEFQYKDDPLPNKRNTIVADYFEKVLMVDNARLIAETTKPVLDAYKMDKVSNEVFVEV"

# Molecular weight and isoelectric point of the baseline, computed once so the
# report loop can print them next to each generated sequence's values
reference_analysis = ProtParam.ProteinAnalysis(reference_sequence)
ref_mw = reference_analysis.molecular_weight()
ref_pI = reference_analysis.isoelectric_point()

# Text-generation pipeline backed by the ProtGPT2 checkpoint
protgpt2 = pipeline('text-generation', model="nferruz/ProtGPT2")

# Sample ten sequences from an empty prompt.
# NOTE(review): sampling is enabled and no seed is set in the visible cells,
# so each run is expected to produce different sequences.
sequences = protgpt2(
    "",
    max_length=200,
    do_sample=True,
    top_k=950,
    repetition_penalty=1.2,
    num_return_sequences=10,
    eos_token_id=0,
)

# Score every generated sequence against the reference and print a report
for i, seq in enumerate(sequences):
    # The generated text carries line breaks (the recorded output shows the
    # sequence wrapped over several lines). Remove all whitespace so only
    # residue letters are analysed and printed.
    generated_sequence = "".join(seq["generated_text"].split())

    # A generation with no residues cannot be aligned or analysed.
    if not generated_sequence:
        print(f"Generated Sequence {i+1} is empty and cannot be processed.")
        continue

    # Screen for every ambiguity code (B, J, X, Z), not only 'X', so that one
    # such sequence is skipped instead of aborting the whole loop.
    # NOTE(review): ProtParam's molecular-weight lookup is expected to reject
    # B/J/Z the same way it rejects X; confirm against the installed Biopython.
    if any(letter in generated_sequence for letter in "BJXZ"):
        print(f"Generated Sequence {i+1} contains ambiguous letters and cannot be processed.")
        continue

    # Molecular weight and isoelectric point of the generated sequence
    generated_analysis = ProtParam.ProteinAnalysis(generated_sequence)
    gen_mw = generated_analysis.molecular_weight()
    gen_pI = generated_analysis.isoelectric_point()

    # Calculate similarity measures
    similarity_sequence = calculate_similarity(reference_sequence, generated_sequence)
    similarity_mw = calculate_molecular_weight_similarity(reference_sequence, generated_sequence)
    similarity_pI = calculate_isoelectric_point_similarity(reference_sequence, generated_sequence)

    # Print results; the similarity scores can be negative, so they are
    # clamped at zero for display.
    print("#")
    print("#")
    print(f"Reference Sequence: {reference_sequence}")
    print(f"Generated Sequence {i+1}: {generated_sequence}")
    print(f"Generated Similarity to Reference: {max(similarity_sequence, 0):.5f}%")
    print(f"Reference Molecular Weight: {ref_mw:.2f}")
    print(f"Generated Molecular Weight: {gen_mw:.2f}")
    print(f"Molecular Weight Similarity: {max(similarity_mw, 0):.5f}%")
    print(f"Reference Isoelectric Point: {ref_pI:.2f}")
    print(f"Generated Isoelectric Point: {gen_pI:.2f}")
    print(f"Isoelectric Point Similarity: {max(similarity_pI, 0):.5f}%")
    print("--------------------------")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


#
#
Reference Sequence: MSKGEEVQNLFASGFRLDSAKTAAVLYGLDAGNSLSGGQPLFFSGLKPRQGVRKIAKELKVRMMDPNFIRVSGRSGKQVTMNEEAVSEFYELQGKINTFIKIVNTKEITFHKEFQYKDDPLPNKRNTIVADYFEKVLMVDNARLIAETTKPVLDAYKMDKVSNEVFVEV
Generated Sequence 1: 
MPLPKKPFDLSKYTPDSDPDQFWGISPDGDMYVFKSGPFAKDNEDVVDAMLIDAALTVEA
HMDLRRDRLKDEVVIIGFEMDTSDISVESRRKYLDFQNFLLWGNQLLGCSARLLDNLDGS
DEETLMYCHIEPKTHDWVVDNIAHKPVPLPLADGKFIVPASSFMRRTMHFLAKNSGAGMQ
GYMKMIGMSEDEALEEMRKLPDNAEKLMQQQAEFGNAVGRASGFANTMGSAFLGGGANEF
DRQYQETMSNFTESIFKGLDTRDRLLNLLKISQPDFLKFLTMEDFSADLRSAAEAAKAIE
MGNIETSGQPSADQVTTNVTSNPFSDPEGNIATPWQAPVPIASPQRPIIAPAPTENKSRP
PVAAPDYENEPAQALSHPPVAAPAN

Generated Similarity to Reference: 52.07101%
Reference Molecular Weight: 19063.72
Generated Molecular Weight: 42499.42
Molecular Weight Similarity: 0.00000%
Reference Isoelectric Point: 9.00
Generated Isoelectric Point: 4.62
Isoelectric Point Similarity: 51.29180%
--------------------------
#
#
Reference Sequence: MSKGEEVQNLFASGFRLDSAKTAAVLYGLDAGNSLSGGQPLFFSGLKPRQGVRKIAKELKVRMMDPNFIRVSGR

In [10]:
# Stamp the end of the run. As with the start-of-run cell, time.time() is an
# absolute epoch timestamp, so the printed number is not an elapsed duration;
# this cell also overwrites the `seconds` / `local_time` values set there.
seconds = time.time()
local_time = time.ctime(seconds)  # same instant, human-readable
print("Time in seconds since end of run:", seconds)
print(local_time)

Time in seconds since end of run: 1712600082.4905665
Mon Apr  8 18:14:42 2024
