# Li Song: the program to re-cluster the CDR3 regions
#!/usr/bin/env python3
import sys
def GetChainType(v, j, c):
s = ""
if (v != "*"):
s = v
elif (c != "*"):
s = c
elif (j != "*"):
s = j
else:
return -1
if (s[0:3] == "IGH"):
return 0
elif (s[0:3] == "IGK"):
return 1
elif (s[0:3] == "IGL"):
return 2
elif (s[0:3] == "TRA"):
return 3
elif (s[0:3] == "TRB"):
return 4
elif (s[0:3] == "TRG"):
return 5
elif (s[0:3] == "TRD"):
return 6
else:
return -1
def GetMainGeneName(g):
return g.split("*")[0]
def CompatibleGeneAssignment(a, b):
if ( GetChainType( a[0], a[2], a[3] ) != GetChainType(b[0], b[2], b[3]) ):
return False
for i in [0, 2]: # V, J gene assignment
ga = a[i].split("*")[0]
gb = b[i].split("*")[0]
if ( ga != "" and gb != "" and ga != gb ):
return False
return True
def CompatibleSequence(a, b, similarity):
if (len(a) != len(b)):
return False ;
diffCnt = 0
diffMax = len(a) - int(len(a) * similarity)
for i in range(len(a)):
if (a[i] != b[i]):
diffCnt += 1
if (diffCnt > diffMax):
return False
return True
def GetFather(tag, father):
if ( father[tag] != tag ):
father[tag] = GetFather( father[tag], father )
return father[tag]
def LargerCluster(cdr3List, similarity):
vjCDR3LenList = {}
clusterNameToId = {}
clusterIdToName = []
for cdr3 in cdr3List:
if ( cdr3[0] not in clusterNameToId ):
clusterNameToId[cdr3[0]] = len(clusterIdToName)
clusterIdToName.append(cdr3[0])
# Reorganize the CDR3 list by their V, J and CDR3 length.
i = 0
for cdr3 in cdr3List:
key = (GetMainGeneName(cdr3[2]), GetMainGeneName(cdr3[4]), len(cdr3[8]))
if (key not in vjCDR3LenList):
vjCDR3LenList[key] = []
vjCDR3LenList[key].append(i)
i += 1
# Prepare the set-union
father = []
clusterNameFirstId = {}
i = 0
for cdr3 in cdr3List:
father.append(i)
if (cdr3[0] in clusterNameFirstId ):
father[i] = clusterNameFirstId[cdr3[0]]
else:
clusterNameFirstId[cdr3[0]] = i
i += 1
# Build up the set-union relation
for key in vjCDR3LenList.keys():
cdr3IdList = vjCDR3LenList[key]
size = len( cdr3IdList )
for i in range(size):
for j in range(i + 1, size):
if ( GetFather( cdr3IdList[i], father ) != GetFather( cdr3IdList[j], father )
and CompatibleSequence( cdr3List[ cdr3IdList[i] ][8], cdr3List[ cdr3IdList[j] ][8],
similarity ) ):
father[ cdr3IdList[j] ] = cdr3IdList[i]
# Associate original clusters to larger cluster
largerClusterToId = []
largerClusterToClusterName = []
rootToLargerClusterId = {}
i = 0
for cdr3 in cdr3List:
root = GetFather(i, father)
lcId = 0
if (root not in rootToLargerClusterId):
rootToLargerClusterId[root] = len(largerClusterToId)
lcId = len(largerClusterToId)
largerClusterToId.append([])
largerClusterToClusterName.append(set({}))
else:
lcId = rootToLargerClusterId[root]
largerClusterToId[lcId].append(i)
largerClusterToClusterName[lcId].add(cdr3[0])
i += 1
# Output the result
# Output the composition of larger cluster
#for i in range(len(largerClusterToId)):
# print("#\tcluster" + str(i), end = "")
# for name in largerClusterToClusterName[i]:
# print("\t" + name, end = "")
# print("\n", end = "")
# Output the new cdr3 format with new larger cluster Id.
for i in range(len(largerClusterToId)):
j = 0
for cdr3Id in largerClusterToId[i]:
cdr3List[cdr3Id].append(cdr3List[cdr3Id][0])
cdr3List[cdr3Id].append(cdr3List[cdr3Id][1])
cdr3List[cdr3Id][0] = "cluster" + str(i) #+ "_" + cdr3List[cdr3Id][0] + "_" + str(cdr3List[cdr3Id][1])
cdr3List[cdr3Id][1] = j
print( "\t".join( str(x) for x in cdr3List[cdr3Id] ) )
j += 1
return
if (__name__ == "__main__"):
if (len(sys.argv) <= 1):
print("usage: a.py trust_cdr3.out [OPTIONS]\n" +
"OPTIONS:\n"
"\t-s FLOAT: similarity of two CDR3s (default: 0.95)")
exit(1)
cdr3List = []
similarity = 0.95 ;
i = 2
while (i < len(sys.argv)):
if (sys.argv[i] == "-s"):
similarity = float(sys.argv[i + 1])
i += 1
else:
print("Unknown option: ", sys.argv[i])
exit(1)
i += 1
fp = open(sys.argv[1])
for line in fp:
line = line.rstrip()
cols = line.split("\t")
cols[1] = int(cols[1])
skip = False
for g in [2, 4]: # Must have V, J genes.
if ( cols[g] == "*"):
skip = True
if (float(cols[9]) == 0):
skip = True
if ( skip ):
continue
for g in [2, 3, 4, 5]:
cols[g] = cols[g].split(",")[0]
cols[9] = float(cols[9])
cols[10] = float(cols[10])
if (cols[9] == 0):
continue
cdr3List.append(cols)
LargerCluster(cdr3List, similarity)