|
a |
|
b/aggmap/utils/gen_nwk.py |
|
|
1 |
# -*- coding: utf-8 -*- |
|
|
2 |
""" |
|
|
3 |
Created on Fri Aug 27 14:06:17 2021 |
|
|
4 |
|
|
|
5 |
@author: Shen Wanxiang |
|
|
6 |
""" |
|
|
7 |
|
|
|
8 |
import csv |
|
|
9 |
from collections import defaultdict |
|
|
10 |
from pprint import pprint |
|
|
11 |
import pandas as pd |
|
|
12 |
from scipy.cluster.hierarchy import dendrogram, linkage, to_tree |
|
|
13 |
from scipy.spatial.distance import squareform |
|
|
14 |
|
|
|
15 |
# Header text of the iTOL TREE_COLORS annotation file. mp2newick() writes this
# text to '<treefile>.txt' first and then appends the per-leaf rows after it.
# The lines starting with '#' below are part of the emitted text, not Python
# comments.
itol_header = '''TREE_COLORS
SEPARATOR TAB

#First 3 fields define the node id, type and color
#Possible types are:
#'range': defines a colored range (colored background for labels/clade)
#'clade': defines color/style for all branches in a clade
#'branch': defines color/style for a single branch
#'label': defines font color/style for the leaf label
#'label_background': defines the leaf label background color

#The following additional fields are required:
#for 'range', field 4 defines the colored range label (used in the legend)

#The following additional fields are optional:
#for 'label', field 4 defines the font style ('normal',''bold', 'italic' or 'bold-italic') and field 5 defines the numeric scale factor for the font size (eg. with value 2, font size for that label will be 2x the standard size)
#for 'clade' and 'branch', field 4 defines the branch style ('normal' or 'dashed') and field 5 defines the branch width scale factor (eg. with value 0.5, branch width for that clade will be 0.5 the standard width)

DATA
#NODE_ID TYPE COLOR LABEL_OR_STYLE SIZE_FACTOR
'''
|
|
36 |
|
|
|
37 |
def _getNewick(node, newick, parentdist, leaf_names):
    """Serialise the subtree rooted at ``node`` into Newick text.

    The string is assembled from right to left: ``newick`` is the text that
    has already been produced and must follow this subtree in the output.

    Parameters
    ----------
    node : object exposing ``is_leaf()``, ``get_left()``, ``get_right()``,
        ``id`` and ``dist``.
    newick : str
        Already-built tail of the output; pass ``""`` for the root call.
    parentdist : number
        ``dist`` of the parent node; the branch length written for ``node``
        is ``parentdist - node.dist``, formatted with two decimals.
    leaf_names : sequence
        Labels looked up by ``node.id`` for leaves.

    Returns
    -------
    str
        Newick text for this subtree followed by ``newick``.
    """
    def _branch_length():
        # Branch length is the height difference between parent and node.
        return parentdist - node.dist

    if node.is_leaf():
        return "{}:{:.2f}{}".format(leaf_names[node.id], _branch_length(), newick)

    # An empty tail means this is the outermost call, so the tree is closed
    # with ");" instead of a branch length.
    if newick:
        tail = "):{:.2f}{}".format(_branch_length(), newick)
    else:
        tail = ");"

    # The left child is placed directly in front of the tail, the right child
    # in front of the left one, separated by a comma.
    with_left = _getNewick(node.get_left(), tail, node.dist, leaf_names)
    with_both = _getNewick(node.get_right(), "," + with_left, node.dist, leaf_names)
    return "(" + with_both
|
|
49 |
|
|
|
50 |
def mp2newick(mp, treefile = 'phenotype_tree', leaf_names = None):
    """Write the hierarchical-clustering tree of ``mp`` as iTOL input files.

    Two files are produced: ``treefile + '.nwk'`` holding the Newick string
    and ``treefile + '.txt'`` holding the iTOL TREE_COLORS header followed by
    one tab-separated row per leaf (leaf name, 'clade', colour, 'normal').

    Parameters
    ----------
    mp : object
        Must provide ``df_embedding`` (a DataFrame with a ``'colors'``
        column), ``flist`` (default leaf names) and ``Z`` (a linkage matrix
        accepted by ``scipy.cluster.hierarchy.to_tree``).
        NOTE(review): presumably a fitted AggMap instance - confirm.
    treefile : str
        Path prefix for the two output files.
    leaf_names : sequence, optional
        Labels for the leaves; defaults to ``mp.flist``. When given, it must
        have one entry per row of ``mp.df_embedding``.

    Returns
    -------
    pandas.DataFrame
        The rows written to the ``.txt`` file, indexed by leaf name, with
        columns ``TYPE``, ``colors`` and ``STYLE``.
    """
    # Work on a copy: the index and columns assigned below must not touch
    # (or warn about) the frame owned by ``mp``.
    df = mp.df_embedding[['colors']].copy()

    # ``is None`` rather than ``== None``: the latter is evaluated
    # element-wise for numpy arrays / pandas indexes and cannot be used as a
    # condition.
    if leaf_names is None:
        leaf_names = mp.flist #use final list only
    else:
        assert len(leaf_names) == len(df), 'leaf names should be a list with a length of %s' % len(df)
    # Row labels become the node ids of the iTOL file, so they have to match
    # the leaf labels used in the Newick string.
    df.index = leaf_names

    linkage_matrix = mp.Z
    root = to_tree(linkage_matrix, rd=False)
    newick = _getNewick(root, "", root.dist, leaf_names = leaf_names)

    # write newick file for itol
    with open(treefile + '.nwk', 'w') as f:
        f.write(newick)

    # write dataset file for itol
    df['TYPE'] = 'clade'
    df['STYLE'] = 'normal'
    df = df[['TYPE', 'colors', 'STYLE']]
    with open(treefile + '.txt', 'w') as f:
        f.write(itol_header)
    # Append the data rows below the header that was just written.
    df.to_csv(treefile + '.txt', mode = 'a', header=False, sep='\t')
    return df
|
|
76 |
|
|
|
77 |
def tree():
    """Return an empty, arbitrarily nestable mapping.

    Accessing a missing key creates another mapping of the same kind, so
    paths of any depth can be created just by indexing.
    """
    nested = defaultdict(tree)
    return nested
|
|
79 |
|
|
|
80 |
def tree_add(t, path):
    """Walk ``path`` down from ``t``, indexing one key per step.

    With an auto-creating mapping (such as the one returned by ``tree()``)
    this inserts every missing node along the path. Nothing is returned.
    """
    cursor = t
    for key in path:
        # Indexing is what creates the child when it does not exist yet.
        cursor = cursor[key]
    return None
|
|
83 |
|
|
|
84 |
def pprint_tree(tree_instance):
    """Pretty-print a nested mapping after converting it to plain dicts."""
    def _as_plain_dict(subtree):
        # Recursively copy every level into a built-in dict.
        plain = {}
        for key in subtree:
            plain[key] = _as_plain_dict(subtree[key])
        return plain

    pprint(_as_plain_dict(tree_instance))
|
|
87 |
|
|
|
88 |
def dfs_to_tree(dfs):
    """Build a nested tree from the rows of a DataFrame.

    Each row is read left to right as a path; missing (NaN) cells are
    dropped before the path is inserted with ``tree_add``.

    Returns the populated mapping created by ``tree()``.
    """
    root = tree()
    for _, record in dfs.iterrows():
        tree_add(root, record.dropna().tolist())
    return root
|
|
94 |
|
|
|
95 |
def tree_to_newick(root):
    """Render a nested mapping as a comma-separated list of Newick clades.

    Every key becomes a node label. When a node has children, their
    rendering is wrapped in parentheses and placed in front of the label.
    No branch lengths and no trailing ';' are emitted.
    """
    rendered = []
    for label, children in root.items():
        inner = tree_to_newick(children) if len(children) > 0 else ''
        # Parentheses are added only when the children produced some text.
        prefix = '(' + inner + ')' if inner != '' else ''
        rendered.append(prefix + label)
    return ','.join(rendered)
|
|
106 |
|
|
|
107 |
def dfs_to_weightless_newick(dfs):
    """Convert a DataFrame of row-wise paths into a Newick string without branch lengths.

    The rows are first merged into a nested tree with ``dfs_to_tree`` and
    that tree is then rendered with ``tree_to_newick``.
    """
    return tree_to_newick(dfs_to_tree(dfs))
|
|
111 |
|
|
|
112 |
|
|
|
113 |
if __name__ == '__main__':
    # Script entry: read './species.list.csv', split every lineage string
    # into its ranks and write the resulting weightless tree to '1.nwk'.

    def _split_lineage(lineage):
        # 'k__X|p__Y|...' -> {'k': 'X', 'p': 'Y', ...}
        return dict(rank.split('__') for rank in lineage.split('|'))

    species_table = pd.read_csv('./species.list.csv', header=None, index_col=0)
    species_list = species_table[1].to_list()

    # One column per rank prefix; rows missing a rank get NaN in that column.
    rank_dicts = pd.Series(species_list).apply(_split_lineage)
    dfs = rank_dicts.apply(pd.Series)

    level_dict = {
        'k': 'kingdom',
        'p': 'phylum',
        'c': 'class',
        'o': 'order',
        'f': 'family',
        'g': 'genus',
        's': 'species',
    }
    dfs = dfs.rename(columns=level_dict)

    nwk_string = dfs_to_weightless_newick(dfs)
    with open("1.nwk", "w") as f:
        f.write(nwk_string)
|
|
122 |
|