Diff of /src/trust-cluster.py [000000] .. [fb5156]

Switch to side-by-side view

--- a
+++ b/src/trust-cluster.py
@@ -0,0 +1,187 @@
+# Li Song: the program to re-cluster the CDR3 regions
+
+#!/usr/bin/env python3
+
+import sys
+
+def GetChainType(v, j, c):
+	s = ""
+	if (v != "*"):
+		s = v
+	elif (c != "*"):
+		s = c 
+	elif (j != "*"):
+		s = j
+	else:
+		return -1
+	
+	if (s[0:3] == "IGH"):
+		return 0
+	elif (s[0:3] == "IGK"):
+		return 1
+	elif (s[0:3] == "IGL"):
+		return 2
+	elif (s[0:3] == "TRA"):
+		return 3
+	elif (s[0:3] == "TRB"):
+		return 4
+	elif (s[0:3] == "TRG"):
+		return 5
+	elif (s[0:3] == "TRD"):
+		return 6
+	else:
+		return -1
+
+def GetMainGeneName(g):
+	return g.split("*")[0] 
+
+def CompatibleGeneAssignment(a, b):
+	if ( GetChainType( a[0], a[2], a[3] ) != GetChainType(b[0], b[2], b[3]) ):
+		return False 
+	
+	for i in [0, 2]: # V, J gene assignment
+		ga = a[i].split("*")[0]
+		gb = b[i].split("*")[0]
+		if ( ga != "" and gb != "" and ga != gb ):
+			return False
+	return True
+
+def CompatibleSequence(a, b, similarity):
+	if (len(a) != len(b)):
+		return False ;
+	diffCnt = 0 
+	diffMax = len(a) - int(len(a) * similarity)
+	for i in range(len(a)):
+		if (a[i] != b[i]):
+			diffCnt += 1
+			if (diffCnt > diffMax):
+				return False
+	return True
+
+def GetFather(tag, father):
+	if ( father[tag] != tag ):
+		father[tag] = GetFather( father[tag], father )
+	return father[tag]
+	
+def LargerCluster(cdr3List, similarity):
+	vjCDR3LenList = {}
+	clusterNameToId = {}
+	clusterIdToName = []
+	for cdr3 in cdr3List:
+		if ( cdr3[0] not in clusterNameToId ):
+			clusterNameToId[cdr3[0]] = len(clusterIdToName)
+			clusterIdToName.append(cdr3[0])
+
+	# Reorganize the CDR3 list by their V, J and CDR3 length.
+	i = 0
+	for cdr3 in cdr3List:
+		key = (GetMainGeneName(cdr3[2]), GetMainGeneName(cdr3[4]), len(cdr3[8]))
+		if (key not in vjCDR3LenList):
+			vjCDR3LenList[key] = []
+		vjCDR3LenList[key].append(i)
+		i += 1
+	# Prepare the set-union
+	father = []
+	clusterNameFirstId = {}
+	i = 0
+	for cdr3 in cdr3List:
+		father.append(i)
+		if (cdr3[0] in clusterNameFirstId ):
+			father[i] = clusterNameFirstId[cdr3[0]]
+		else:
+			clusterNameFirstId[cdr3[0]] = i
+		i += 1
+	
+	# Build up the set-union relation
+	for key in vjCDR3LenList.keys():
+		cdr3IdList = vjCDR3LenList[key]
+		size = len( cdr3IdList )
+		for i in range(size):
+			for j in range(i + 1, size):
+				if ( GetFather( cdr3IdList[i], father ) != GetFather( cdr3IdList[j], father ) 
+					and CompatibleSequence( cdr3List[ cdr3IdList[i] ][8], cdr3List[ cdr3IdList[j] ][8], 
+					similarity ) ):
+					father[ cdr3IdList[j] ] = cdr3IdList[i]
+					
+	# Associate original clusters to larger cluster
+	largerClusterToId = []
+	largerClusterToClusterName = []
+	rootToLargerClusterId = {}
+	i = 0 
+	for cdr3 in cdr3List:
+		root = GetFather(i, father)
+		lcId = 0
+		if (root not in rootToLargerClusterId):
+			rootToLargerClusterId[root] = len(largerClusterToId)
+			lcId = len(largerClusterToId)
+			largerClusterToId.append([])
+			largerClusterToClusterName.append(set({}))
+		else:
+			lcId = rootToLargerClusterId[root]
+
+		largerClusterToId[lcId].append(i)
+		largerClusterToClusterName[lcId].add(cdr3[0])
+		i += 1		
+	
+	# Output the result
+	# Output the composition of larger cluster
+	#for i in range(len(largerClusterToId)):
+	#	print("#\tcluster" + str(i), end = "")
+	#	for name in largerClusterToClusterName[i]:
+	#		print("\t" + name, end = "")
+	#	print("\n", end = "")
+	
+	# Output the new cdr3 format with new larger cluster Id.
+	for i in range(len(largerClusterToId)):
+		j = 0
+		for cdr3Id in largerClusterToId[i]:
+			cdr3List[cdr3Id].append(cdr3List[cdr3Id][0])
+			cdr3List[cdr3Id].append(cdr3List[cdr3Id][1])
+			cdr3List[cdr3Id][0] = "cluster" + str(i) #+ "_" + cdr3List[cdr3Id][0] + "_" + str(cdr3List[cdr3Id][1])
+			cdr3List[cdr3Id][1] = j
+			print( "\t".join( str(x) for x in cdr3List[cdr3Id] ) )
+			j += 1
+
+	return 
+
+
+if (__name__ == "__main__"):
+	if (len(sys.argv) <= 1):
+		print("usage: a.py trust_cdr3.out [OPTIONS]\n" + 
+			"OPTIONS:\n" 
+			"\t-s FLOAT: similarity of two CDR3s (default: 0.95)") 	
+		exit(1)
+	cdr3List = []
+	similarity = 0.95 ;
+	i = 2
+	while (i < len(sys.argv)):
+		if (sys.argv[i] == "-s"):
+			similarity = float(sys.argv[i + 1])
+			i += 1
+		else:
+			print("Unknown option: ", sys.argv[i])
+			exit(1)
+		i += 1
+
+	fp = open(sys.argv[1])
+	for line in fp:
+		line = line.rstrip()
+		cols = line.split("\t")
+		cols[1] = int(cols[1])
+		skip = False
+		for g in [2, 4]: # Must have V, J genes.
+			if ( cols[g] == "*"):
+				skip = True 
+		if (float(cols[9]) == 0):
+			skip = True
+		if ( skip ):
+			continue 
+
+		for g in [2, 3, 4, 5]:
+			cols[g] = cols[g].split(",")[0]
+		cols[9] = float(cols[9])
+		cols[10] = float(cols[10])
+		if (cols[9] == 0):
+			continue
+		cdr3List.append(cols)
+	LargerCluster(cdr3List, similarity)