DrugCLIP / Git / [b40915] /HomoAug/run

Models:
Amanda-D/
DrugCLIP
Downloads: 1
[b40915]: / HomoAug / run_HomoAug.py
History
Download this file
838 lines (770 with data), 29.7 kB

# Copyright © 2023 Institute for AI Industry Research (AIR), Tsinghua University.
# License: GNU GPLv3. [See details in LICENSE]

import argparse
import os
import glob
import Bio.PDB
import ray
import random
import json
import mmap
import time
import sys
import traceback
from ray.util.queue import Queue
import numpy as np

sys.path.append(".")
from utils.misc import execute
from utils.ray_tools import ProgressBar
from tqdm import tqdm
import pathlib
import subprocess
import shutil


class JackhmmerRunner:
    def __init__(self, database_dir, task_ids, seq_dir, output_dir, n_thread):
        self.N_CPU_PER_THREAD = 1
        self.n_thread = n_thread
        self.database_dir = database_dir
        self.task_ids = task_ids
        self.seq_dir = seq_dir
        self.output_dir = output_dir
        if output_dir.endswith("/"):
            output_dir = output_dir[:-1]
        if not os.path.exists(output_dir):
            os.mkdir(output_dir)
        self.sto_dir = output_dir + "_sto"
        if not os.path.exists(self.sto_dir):
            os.mkdir(self.sto_dir)
        self.a2m_dir = output_dir + "_a2m"
        if not os.path.exists(self.a2m_dir):
            os.mkdir(self.a2m_dir)

    def split_list(_list, n):
        chunk_size = (len(_list) - 1) // n + 1
        chunks = [_list[i * chunk_size : (i + 1) * chunk_size] for i in range(n)]
        return chunks

    @ray.remote(num_cpus=1)
    def process_jobs(self, id, jobs_queue, actor):
        print("start process", id)
        while not jobs_queue.empty():
            job = jobs_queue.get()
            try:
                self.execute_one_job(job)

            except:
                print(f"failed: {job}")
                traceback.print_exception(*sys.exc_info())
            try:
                actor.update.remote(1)
            except:
                pass
        return 1

    def execute_one_job(self, job):
        seq_file_Path = pathlib.Path(self.seq_dir) / job[:4] / (job + ".fasta")
        output_file_Path = pathlib.Path(self.sto_dir) / (job + ".sto")
        execute(
            f"jackhmmer"
            f" --cpu {self.N_CPU_PER_THREAD}"
            f" -A {output_file_Path}"
            f" -o /dev/null"
            f" -E 0.001"
            f" -N 3"
            f" {str(seq_file_Path)}"
            f" {self.database_dir}"
        )
        # with open(output_file_Path,"w") as f:
        #     f.write("")
        # # sleep 1 s
        # time.sleep(1)
        return 1

    def change_sto_to_fasta(self):
        sto_files = glob.glob(self.sto_dir + "/*.sto")
        for sto_path in tqdm(sto_files):
            pdb_id = sto_path.split("/")[-1].split(".")[0]
            # print(pdb_id)
            a2m_path = self.a2m_dir + f"/{pdb_id}.a2m"
            execute(
                f"esl-reformat --informat stockholm"
                f" -o {str(a2m_path)} a2m"
                f" {str(sto_path)}"
            )
            fasta_path = self.output_dir + f"/{pdb_id}.fasta"
            output = ""
            with open(a2m_path) as f:
                for line in f:
                    line = line.strip()
                    if line[0] != ">":
                        output += line
                    else:
                        output += "\n" + line + "\n"
            output = output.strip()
            output = output.split("\n")[:-2]
            output = "\n".join(output)
            with open(fasta_path, "w") as f:
                f.write(output)

    def run_jackhmmer(self):
        all_jobs = []
        with open(self.task_ids, "r") as f:
            data = f.readlines()
            for line in data:
                job = line.strip()
                job = job[:4].upper()
                all_jobs.append(job)
        print("all jobs:", len(all_jobs))
        uncompleted_jobs = all_jobs
        # completed?
        # uncompleted_jobs=[]
        # for job in all_jobs:
        #     if (not is_complete(job)):
        #         uncompleted_jobs.append(job)
        # print("uncompleted jobs:",len(uncompleted_jobs))
        ray.init()
        job_queue = Queue()
        for job in tqdm(uncompleted_jobs):
            job_queue.put(job)
        print("job queue size:", job_queue.qsize())
        pb = ProgressBar(len(all_jobs))
        actor = pb.actor
        print("actor:", actor)
        job_id_list = []
        self.n_thread = min(self.n_thread, len(uncompleted_jobs))
        for i in range(self.n_thread):
            job_id_list.append(self.process_jobs.remote(self, i, job_queue, actor))
        pb.print_until_done()
        result = ray.get(job_id_list)
        print("Run homo search done!")
        ray.shutdown()

        # remove tmp folders
        shutil.rmtree(self.a2m_dir)
        shutil.rmtree(self.sto_dir)

        # change sto to fasta
        self.change_sto_to_fasta()


class LigandPocketExtractor:
    def __init__(self, id_file, homoaug_dir, n_thread):
        self.id_file = id_file
        self.homoaug_dir = homoaug_dir
        self.n_thread = n_thread
        self.read_ligand_name_chain_name()

    def read_ligand_name_chain_name(self):
        # read ligand_name
        ligand_name = {}
        chain_name = {}
        with open(self.id_file, "r") as f:
            for line in f.readlines():
                line = line.strip()
                id = line[:4].upper()
                ligand_name[id] = line.split("_")[-1]
                chain_name[id] = line.split("_")[1]
        # print("ligand_name:",ligand_name)
        self.ligand_name = ligand_name
        self.chain_name = chain_name

    def execute_one_job(self, job):
        id = job
        # print ('############################')
        # print (id)
        # read original pdb
        pdb_file = self.homoaug_dir + "/" + id + "/" + id + "_protein.pdb"
        if not os.path.exists(pdb_file):
            print("no pdb file for id:", id)
            return 1
        pdb_parser = Bio.PDB.PDBParser(QUIET=True)
        structure = pdb_parser.get_structure(id, pdb_file)
        model = structure[0]
        chain = model[self.chain_name[id]]
        # get pocket_chain to a new pdb
        pocket_chain = Bio.PDB.Chain.Chain(self.chain_name[id])
        for residue in chain:
            if residue.id[0] == " ":
                pocket_chain.add(residue)
        io = Bio.PDB.PDBIO()
        io.set_structure(pocket_chain)
        io.save(self.homoaug_dir + "/" + id + "/" + id + "_pocket_chain.pdb")

        # save ligand to a new pdb
        ligand_chain = Bio.PDB.Chain.Chain(self.chain_name[id])
        ligand_found = False
        for residue in chain:
            if residue.resname == self.ligand_name[id]:
                ligand_chain.add(residue)
                ligand_found = True
        if ligand_found:
            io = Bio.PDB.PDBIO()
            io.set_structure(ligand_chain)
            io.save(self.homoaug_dir + "/" + id + "/" + id + "_ligand.pdb")
        else:
            print("ligand not found:", id)
            return 1

        # remove ligand in the id.pocket.pdb
        pdb_file = self.homoaug_dir + "/" + id + "/" + id + "_pocket.pdb"
        pdb_parser = Bio.PDB.PDBParser(QUIET=True)
        structure = pdb_parser.get_structure(id, pdb_file)
        model = structure[0]
        chain = model["R"]
        # get pocket to a new pdb
        pocket = Bio.PDB.Chain.Chain("A")
        for residue in chain:
            if residue.id[0] == " ":
                pocket.add(residue)
        # write pocket pdb
        io = Bio.PDB.PDBIO()
        io.set_structure(pocket)
        io.save(self.homoaug_dir + "/" + id + "/" + id + "_pocket.pdb")

    @ray.remote(num_cpus=1)
    def process_jobs(self, id, jobs_queue, actor):
        print("start process", id)
        while not jobs_queue.empty():
            job = jobs_queue.get()
            try:
                self.execute_one_job(job)
            except:
                print(f"failed: {job}")
                traceback.print_exception(*sys.exc_info())
            try:
                actor.update.remote(1)
            except:
                pass
        return 1

    def run(self):
        all_jobs = glob.glob(self.homoaug_dir + "/*")
        all_jobs = [x.split("/")[-1] for x in all_jobs]
        uncompleted_jobs = all_jobs
        job_queue = Queue()
        for job in tqdm(uncompleted_jobs):
            job_queue.put(job)
        print("job queue size:", job_queue.qsize())
        pb = ProgressBar(len(all_jobs))
        actor = pb.actor
        job_id_list = []
        self.n_thread = min(self.n_thread, len(uncompleted_jobs))
        for i in range(self.n_thread):
            job_id_list.append(self.process_jobs.remote(self, i, job_queue, actor))
        pb.print_until_done()
        result = ray.get(job_id_list)
        ray.shutdown()
        print("Done!")


class PocketPositionExtractor:
    def __init__(self, homoaug_dir, n_thread):
        self.homoaug_dir = homoaug_dir
        self.n_thread = n_thread
        self.aa_3_to_1 = {
            "CYS": "C",
            "ASP": "D",
            "SER": "S",
            "GLN": "Q",
            "LYS": "K",
            "ILE": "I",
            "PRO": "P",
            "THR": "T",
            "PHE": "F",
            "ASN": "N",
            "GLY": "G",
            "HIS": "H",
            "LEU": "L",
            "ARG": "R",
            "TRP": "W",
            "ALA": "A",
            "VAL": "V",
            "GLU": "E",
            "TYR": "Y",
            "MET": "M",
            "MSE": "M",
            "CME": "C",
            "CSO": "C",
            "UNK": "X",
        }

    def execute_one_job(self, job):
        id = job
        # print ('############################')
        # print (id)

        # read pocket_pdb
        pocket_pdb_file = self.homoaug_dir + "/" + id + "/" + id + "_pocket.pdb"
        pocket_pdb_structure = Bio.PDB.PDBParser().get_structure(id, pocket_pdb_file)
        model = pocket_pdb_structure[0]
        for chain in model:
            pocket_chain_id = chain.id
            break
        chain = model[pocket_chain_id]

        # get the pocket atom coordinates
        pocket_atom_coordinates = set()
        for residue in chain:
            if residue.id[0] != " ":
                continue
            for atom in residue:
                pocket_atom_coordinates.add(tuple(atom.get_coord()))
        # print("pocket_atom_coordinates:",pocket_atom_coordinates)

        sequence = ""
        pocket_chain_pdb_file = (
            self.homoaug_dir + "/" + id + "/" + id + "_pocket_chain.pdb"
        )
        # get the position of the pocket residues in the pocket chain
        pocket_chain_structure = Bio.PDB.PDBParser().get_structure(
            id, pocket_chain_pdb_file
        )
        model = pocket_chain_structure[0]
        for chain in model:
            pocket_chain_id = chain.id
            break
        chain_num = len(list(model.get_chains()))
        if chain_num > 1:
            print("error: more than 1 chain in pocket_pdb_file:", id)
            return 1
        chain = model[pocket_chain_id]
        for residue in chain:
            if residue.id[0] != " ":
                continue
            in_pocket = False
            for atom in residue:
                if tuple(atom.get_coord()) in pocket_atom_coordinates:
                    in_pocket = True
                    break
            if in_pocket:
                sequence += self.aa_3_to_1[residue.resname]
            else:
                sequence += "-"

        # save
        sequence_position_file = (
            self.homoaug_dir + "/" + id + "/" + id + "_pocket_position.txt"
        )
        with open(sequence_position_file, "w") as f:
            f.write(sequence)

    @ray.remote(num_cpus=1)
    def process_jobs(self, id, jobs_queue, actor):
        print("start process", id)
        while not jobs_queue.empty():
            job = jobs_queue.get()
            try:
                self.execute_one_job(job)
            except:
                print(f"failed: {job}")
                traceback.print_exception(*sys.exc_info())
            try:
                actor.update.remote(1)
            except:
                pass
        return 1

    def run(self):
        all_jobs = []
        for id in os.listdir(self.homoaug_dir):
            if os.path.isfile(self.homoaug_dir + "/" + id + "/" + id + "_ligand.pdb"):
                if os.path.isfile(
                    self.homoaug_dir + "/" + id + "/" + id + "_pocket.pdb"
                ):
                    all_jobs.append(id)
        uncompleted_jobs = all_jobs
        job_queue = Queue()
        for job in tqdm(uncompleted_jobs):
            job_queue.put(job)
        print("job queue size:", job_queue.qsize())
        pb = ProgressBar(len(all_jobs))
        actor = pb.actor
        job_id_list = []
        self.n_thread = min(self.n_thread, len(uncompleted_jobs))
        for i in range(self.n_thread):
            job_id_list.append(self.process_jobs.remote(self, i, job_queue, actor))
        pb.print_until_done()
        result = ray.get(job_id_list)
        ray.shutdown()
        print("Done!")


class TMalignRunner:
    def __init__(
        self,
        max_extend_num,
        homoaug_dir,
        MSA_dir,
        AF2DB_dir,
        TMscore_threshold,
        Match_rate_threshold,
        n_thread,
    ):
        self.n_thread = n_thread
        self.max_extend_num = max_extend_num
        self.homoaug_dir = homoaug_dir
        self.MSA_dir = MSA_dir
        self.AF2DB_dir = AF2DB_dir
        self.TMscore_threshold = TMscore_threshold
        self.Match_rate_threshold = Match_rate_threshold

    def _remove_gap_of_primary_sequence(self, primary_sequence, candidate_sequence):
        assert len(primary_sequence) == len(candidate_sequence)
        primary_sequence_without_gap = ""
        candidate_sequence_without_gap = ""
        for i in range(len(primary_sequence)):
            if primary_sequence[i] != "-":
                primary_sequence_without_gap += primary_sequence[i]
                candidate_sequence_without_gap += candidate_sequence[i]
        return primary_sequence_without_gap, candidate_sequence_without_gap

    def _calc_match_rate(self, pocket_position, Aligned_seq):
        total_cnt = 0
        match_cnt = 0
        for i in range(len(Aligned_seq)):
            if pocket_position[i] != "-":
                total_cnt += 1
                if pocket_position[i] == Aligned_seq[i]:
                    match_cnt += 1
        return match_cnt / total_cnt

    def _get_rotate_matrix(self, rotate_matrix_file):
        with open(rotate_matrix_file, "r") as f:
            data = f.readlines()
        u = []
        t = []
        for i in range(2, 5):
            line = data[i].split(" ")
            line_float = [float(x) for x in line if x != ""]
            t.append(line_float[1])
            u.append(line_float[2:])
        u = np.array(u)
        t = np.array(t)
        return u, t

    def _read_ligand_pdb(self, ligand_pdb_file):
        parser = Bio.PDB.PDBParser()
        structure = parser.get_structure("ligand", ligand_pdb_file)
        ligand_coords = []
        for model in structure:
            for chain in model:
                for residue in chain:
                    for atom in residue:
                        ligand_coords.append(atom.get_coord())
        return ligand_coords

    @ray.remote(num_cpus=1)
    def process_jobs(self, id, jobs_queue, actor):
        print("start process", id)
        while not jobs_queue.empty():
            job = jobs_queue.get()
            try:
                self.execute_one_job(job)

            except:
                print(f"failed: {job}")
                traceback.print_exception(*sys.exc_info())
            try:
                actor.update.remote(1)
            except:
                pass
        return 1

    def execute_one_job(self, id):
        # print("#######################")
        print(id)
        # get the sequence from pdb
        fasta_dir = self.homoaug_dir + "/" + id + "/" + id + ".fasta"
        with open(fasta_dir) as f:
            fasta = f.readlines()

        # read the pocket position
        pocket_position_file = (
            self.homoaug_dir + "/" + id + "/" + id + "_pocket_position.txt"
        )
        if not os.path.exists(pocket_position_file):
            print("position_file not exist")
            return 1
        with open(pocket_position_file) as f:
            pocket_position = f.readline().strip()

        # create rotation matrix dir
        rotation_matrix_dir = self.homoaug_dir + "/" + id + "/" + "rotation_matrix/"
        if not os.path.exists(rotation_matrix_dir):
            os.makedirs(rotation_matrix_dir)

        # get the sequence from TMalign
        chain_pdb_file = self.homoaug_dir + "/" + id + "/" + id + "_pocket_chain.pdb"
        MSA_file = self.MSA_dir + f"/{id}" + ".fasta"
        if not os.path.exists(MSA_file):
            print("MSA_file not exist")
            return 1
        MSA_ids = []
        with open(MSA_file) as f:
            lines = f.readlines()
            for idx in range(0, len(lines), 2):
                MSA_ids.append(lines[idx].strip().split(" ")[-1])

        # create extend dir
        extend_dir = self.homoaug_dir + "/" + id + "/" + "extend"
        if not os.path.exists(extend_dir):
            os.mkdir(extend_dir)

        # get ligand
        ligand_file = self.homoaug_dir + "/" + id + "/" + id + "_ligand.pdb"
        ligand_coords = self._read_ligand_pdb(ligand_file)

        # TMalign
        cnt = len(glob.glob(extend_dir + "/*"))
        for MSA_id in list(MSA_ids):
            if cnt >= self.max_extend_num:
                break
            # calculate TMscore
            MSA_pdb_file = self.AF2DB_dir + f"/{MSA_id}.pdb"
            if not os.path.exists(MSA_pdb_file):
                continue
            rotation_matrix_file = rotation_matrix_dir + f"{MSA_id}.txt"
            if os.path.exists(rotation_matrix_file):
                continue
            out_bytes = subprocess.check_output(
                ["TMalign", MSA_pdb_file, chain_pdb_file, "-m", rotation_matrix_file]
            )
            out_text = out_bytes.decode("utf-8").strip().split("\n")
            TMscore1 = float(out_text[12].split(" ")[1])
            TMscore2 = float(out_text[13].split(" ")[1])
            (
                sequence_from_TMalign,
                MSA_aligned_sequence,
            ) = self._remove_gap_of_primary_sequence(out_text[19], out_text[17])
            TMalign_file = rotation_matrix_dir + f"{MSA_id}_TMscore.txt"
            with open(TMalign_file, "w") as f:
                f.write("TMscore normalized to chain_pdb:" + str(TMscore2) + "\n")
                f.write("TMscore normalized to MSA_pdb:" + str(TMscore1) + "\n")
                f.write("Aligned sequence : \n")
                f.write(sequence_from_TMalign + "\n")
                f.write(MSA_aligned_sequence + "\n")
            TMscore = TMscore2

            # calculate Match_score
            Aligned_seq = MSA_aligned_sequence
            Match_rate = self._calc_match_rate(pocket_position, Aligned_seq)

            # print("MSA_id:",MSA_id)
            # print("TMscore:",TMscore)
            # print("Match_rate:",Match_rate)
            if (
                TMscore >= self.TMscore_threshold
                and Match_rate >= self.Match_rate_threshold
            ):
                extend_instance_dir = extend_dir + "/" + MSA_id + "/"
                # if os.path.exists(extend_instance_dir):
                #     continue
                os.mkdir(extend_instance_dir)

                # read ori MSA pdb file
                MSA_pdb_file = self.AF2DB_dir + f"/{MSA_id}" + ".pdb"
                parser = Bio.PDB.PDBParser()
                structure = parser.get_structure(MSA_id, MSA_pdb_file)
                model = structure[0]
                for chain in model:
                    MSA_chain_id = chain.id
                    break
                MSA_chain = model[MSA_chain_id]

                # get rotate_matrix
                rotation_matrix_file = rotation_matrix_dir + f"{MSA_id}.txt"
                rotation_matrix = self._get_rotate_matrix(rotation_matrix_file)

                for residue in MSA_chain:
                    for atom in residue:
                        coord = atom.get_coord()
                        coord = np.array(coord)
                        new_coord = (
                            np.dot(rotation_matrix[0], coord) + rotation_matrix[1]
                        )
                        atom.set_coord(new_coord)

                # write new pdb file
                io = Bio.PDB.PDBIO()
                io.set_structure(structure)
                io.save(extend_instance_dir + f"{MSA_id}" + "_protein.pdb")

                # get pocket , which is in the 6A of ligand
                MSA_pocket_file = extend_instance_dir + f"{MSA_id}" + "_pocket.pdb"
                for residue in MSA_chain:
                    remove_atom_ids = []
                    for atom in residue:
                        # print("atom: ",atom.id)
                        coord = atom.get_coord()
                        f = 0
                        for ligand_coord in ligand_coords:
                            dis = np.linalg.norm(coord - ligand_coord)
                            if np.linalg.norm(coord - ligand_coord) <= 6:
                                f = 1
                                break
                        if f == 0:
                            remove_atom_ids.append(atom.id)
                    for atom_id in remove_atom_ids:
                        residue.detach_child(atom_id)
                io = Bio.PDB.PDBIO()
                io.set_structure(structure)
                io.save(MSA_pocket_file)
                cnt += 1
        print("finish: pdb_id:", id)
        return 1

    def run(self):
        all_jobs = []
        for id in os.listdir(self.homoaug_dir):
            if (
                os.path.isfile(self.homoaug_dir + "/" + id + "/" + id + "_ligand.pdb")
                and os.path.isfile(
                    self.homoaug_dir + "/" + id + "/" + id + "_pocket.pdb"
                )
                and os.path.isfile(
                    self.homoaug_dir + "/" + id + "/" + id + "_pocket_position.txt"
                )
            ):
                all_jobs.append(id)
        uncompleted_jobs = all_jobs
        job_queue = Queue()
        for job in tqdm(uncompleted_jobs):
            job_queue.put(job)
        print("job queue size:", job_queue.qsize())
        pb = ProgressBar(len(all_jobs))
        actor = pb.actor
        job_id_list = []
        self.n_thread = min(self.n_thread, len(uncompleted_jobs))
        for i in range(self.n_thread):
            job_id_list.append(self.process_jobs.remote(self, i, job_queue, actor))
        pb.print_until_done()
        result = ray.get(job_id_list)
        ray.shutdown()
        print("Done!")


class HomoAugRunner:
    def __init__(self, args):
        self.id_file = args.id_file
        self.homoaug_dir = args.homoaug_dir
        self.fasta_file = args.fasta_file
        self.protein_pdb_dir = args.protein_pdb_dir
        self.pocket_pdbs_dir = args.pocket_pdbs_dir
        self.jackhmmer_output_dir = args.jackhmmer_output_dir
        self.database_fasta_path = args.database_fasta_path
        self.max_extend_num = args.max_extend_num
        self.database_pdb_dir = args.database_pdb_dir
        self.TMscore_threshold = args.TMscore_threshold
        self.Match_rate_threshold = args.Match_rate_threshold
        self.n_thread = args.n_thread

    def read_dataset_fasta(self):
        protein_seq = {}
        with open(self.fasta_file) as f:
            fasta = f.readlines()
            for i in range(0, len(fasta), 2):
                id = fasta[i].strip()
                id = id[1:5].upper()
                seq = fasta[i + 1].strip()
                protein_seq[id] = seq
        return protein_seq

    def create_dir(self):
        # Create homoaug dir and subdirs
        # Dir format
        # homoaug_dir
        # └── id
        #     ├── id.fasta
        #     └── id_protein.pdb
        #     └── id_pocket.pdb

        protein_seq = self.read_dataset_fasta()
        if not os.path.exists(self.homoaug_dir):
            os.mkdir(self.homoaug_dir)
        with open(self.id_file, "r") as f:
            lines = f.readlines()
        for line in tqdm(lines):
            id = line.strip()
            id = id[:4].upper()
            if not os.path.exists(self.homoaug_dir + "/" + id):
                os.mkdir(self.homoaug_dir + "/" + id)

                # create fasta
                with open(self.homoaug_dir + "/" + id + "/" + id + ".fasta", "w") as f:
                    f.write(">" + id + "\n" + protein_seq[id] + "\n")

                # copy pdb
                cif_file = glob.glob(self.protein_pdb_dir + "/" + id + "*")
                if len(cif_file) != 1:
                    print("error : having more than 1 cif file", id)
                    continue
                cif_file = cif_file[0]
                # read cif
                parser = Bio.PDB.MMCIFParser(QUIET=True)
                structure = parser.get_structure(id, cif_file)
                # get number of models
                n_model = len(structure)
                if n_model != 1:
                    print("error : having more than 1 model", id)
                    continue
                # get number of chains
                n_chain = len(list(structure.get_chains()))
                if n_chain != 1:
                    print("error : having more than 1 chain", id)
                    continue
                # save pdb
                io = Bio.PDB.PDBIO()
                io.set_structure(structure)
                io.save(self.homoaug_dir + "/" + id + "/" + id + "_protein.pdb")
                # os.system("cp "+homoaug_dir+"/"+id+"/"+id+"_protein.pdb "+homoaug_dir+"/"+id+"/"+id+"_pocket_chain.pdb")

                # copy pocket
                pocket_file = glob.glob(self.pocket_pdbs_dir + "/" + id + "*")
                if len(pocket_file) != 1:
                    print("error : having more than 1 pocket file", id)
                    continue
                pocket_file = pocket_file[0]
                # copy to homoaug
                os.system(
                    "cp "
                    + pocket_file
                    + " "
                    + self.homoaug_dir
                    + "/"
                    + id
                    + "/"
                    + id
                    + "_pocket.pdb"
                )

    def run(self):
        self.create_dir()

        # Run jackhmmer
        print("# Start running jackhmmer")
        jackhmmer_runner = JackhmmerRunner(
            database_dir=self.database_fasta_path,
            task_ids=self.id_file,
            seq_dir=self.homoaug_dir,
            output_dir=self.jackhmmer_output_dir,
            n_thread=self.n_thread,
        )
        jackhmmer_runner.run_jackhmmer()

        print("# Start running ligand pocket extractor")
        ligand_pocket_extractor = LigandPocketExtractor(
            self.id_file, self.homoaug_dir, self.n_thread
        )
        ligand_pocket_extractor.run()

        print("# Start running pocket position extractor")
        pocket_position_extractor = PocketPositionExtractor(
            self.homoaug_dir, self.n_thread
        )
        pocket_position_extractor.run()

        print("# Start running TMalign")
        tmalign_runner = TMalignRunner(
            self.max_extend_num,
            self.homoaug_dir,
            self.jackhmmer_output_dir,
            self.database_pdb_dir,
            self.TMscore_threshold,
            self.Match_rate_threshold,
            self.n_thread,
        )
        tmalign_runner.run()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--id_file", type=str, default="/drug/BioLip/tmp.id")
    parser.add_argument("--homoaug_dir", type=str, default="/drug/BioLip/homoaug_new")
    parser.add_argument(
        "--fasta_file",
        type=str,
        default="/drug/BioLip/BioLiP_v2023-04-13_regularLigand.fasta",
    )
    parser.add_argument(
        "--protein_pdb_dir", type=str, default="/drug/BioLip/protein_pdb"
    )
    parser.add_argument(
        "--pocket_pdbs_dir", type=str, default="/drug/BioLip/pocket_pdb"
    )
    parser.add_argument(
        "--jackhmmer_output_dir", type=str, default="/drug/BioLip/pdbbind_MSA_fasta"
    )
    parser.add_argument(
        "--n_thread", type=int, default=10, help="number of threads for running"
    )
    parser.add_argument(
        "--database_fasta_path",
        type=str,
        default="/data/protein/AF2DB/AFDB_HC_50.fa",
        help="jackhmmer search database, in fasta format",
    )
    parser.add_argument(
        "--database_pdb_dir",
        type=str,
        default="/drug/AFDB_HC_50_PDB",
        help="homoaug search database, e.g. AF2DB",
    )
    parser.add_argument(
        "--max_extend_num",
        type=int,
        default=20,
        help="max number of extended pocket-ligand pairs for one real pocket-ligand pair",
    )
    parser.add_argument(
        "--TMscore_threshold",
        type=float,
        default=0.4,
        help="TMscore threshold for extending",
    )
    parser.add_argument(
        "--Match_rate_threshold",
        type=float,
        default=0.4,
        help="Match_rate threshold for extending",
    )

    args = parser.parse_args()
    homoaug_runner = HomoAugRunner(args)
    homoaug_runner.run()
    print("HomoAug Done!")