Diff of /aggmap/utils/gen_nwk.py [000000] .. [9e8054]

Switch to unified view

a b/aggmap/utils/gen_nwk.py
1
# -*- coding: utf-8 -*-
2
"""
3
Created on Fri Aug 27 14:06:17 2021
4
5
@author: Shen Wanxiang
6
"""
7
8
import csv
9
from collections import defaultdict
10
from pprint import pprint
11
import pandas as pd
12
from scipy.cluster.hierarchy import dendrogram, linkage, to_tree
13
from scipy.spatial.distance import squareform
14
15
# Header of an iTOL "TREE_COLORS" annotation file. mp2newick() writes this
# text first and then appends one tab-separated row per leaf after the DATA
# marker. The '#' lines inside the literal are iTOL template comments that are
# emitted verbatim; they are part of the string, not Python comments.
# The literal must contain only template text: stray bare-number lines (for
# example line numbers picked up from a diff viewer) would end up in every
# annotation file that is written.
itol_header = '''TREE_COLORS
SEPARATOR TAB

#First 3 fields define the node id, type and color
#Possible types are:
#'range': defines a colored range (colored background for labels/clade)
#'clade': defines color/style for all branches in a clade
#'branch': defines color/style for a single branch
#'label': defines font color/style for the leaf label
#'label_background': defines the leaf label background color

#The following additional fields are required:
#for 'range', field 4 defines the colored range label (used in the legend)

#The following additional fields are optional:
#for 'label', field 4 defines the font style ('normal',''bold', 'italic' or 'bold-italic') and field 5 defines the numeric scale factor for the font size (eg. with value 2, font size for that label will be 2x the standard size)
#for 'clade' and 'branch', field 4 defines the branch style ('normal' or 'dashed') and field 5 defines the branch width scale factor (eg. with value 0.5, branch width for that clade will be 0.5 the standard width)

DATA
#NODE_ID TYPE COLOR LABEL_OR_STYLE SIZE_FACTOR
'''
36
37
def _getNewick(node, newick, parentdist, leaf_names):
38
    if node.is_leaf():
39
        return "%s:%.2f%s" % (leaf_names[node.id], parentdist - node.dist, newick)
40
    else:
41
        if len(newick) > 0:
42
            newick = "):%.2f%s" % (parentdist - node.dist, newick)
43
        else:
44
            newick = ");"
45
        newick = _getNewick(node.get_left(), newick, node.dist, leaf_names)
46
        newick = _getNewick(node.get_right(), ",%s" % (newick), node.dist, leaf_names)
47
        newick = "(%s" % (newick)
48
        return newick
49
    
50
def mp2newick(mp, treefile='phenotype_tree', leaf_names=None):
    """Write the clustering tree of ``mp`` as a Newick file plus an iTOL colour file.

    Two files are produced: ``<treefile>.nwk`` with the Newick string built
    from ``mp.Z``, and ``<treefile>.txt`` with ``itol_header`` followed by one
    tab-separated ``index, 'clade', colour, 'normal'`` row per leaf.

    Parameters
    ----------
    mp : object
        Must provide ``df_embedding`` (a DataFrame with the columns
        ``'colors'`` and ``'Subtypes'``), ``flist`` (default leaf labels) and
        ``Z`` (a linkage matrix accepted by ``scipy.cluster.hierarchy.to_tree``).
    treefile : str
        Output path without extension.
    leaf_names : sequence, optional
        Replacement leaf labels, one per row of ``mp.df_embedding``. When
        given they are used in the Newick string and as the index of the
        annotation table. When omitted, ``mp.flist`` labels the Newick leaves
        and the table keeps the index of ``mp.df_embedding``.

    Returns
    -------
    pandas.DataFrame
        The annotation table with the columns ``TYPE``, ``colors``, ``STYLE``.

    Raises
    ------
    AssertionError
        If ``leaf_names`` is given with a length different from the table's.
    """
    # Work on a copy: the index and column assignments below must neither
    # modify mp.df_embedding nor trigger pandas' SettingWithCopyWarning.
    df = mp.df_embedding[['colors', 'Subtypes']].copy()

    # Identity test on purpose: `== None` is evaluated element-wise for
    # numpy arrays / pandas objects and then fails inside `if`.
    if leaf_names is None:
        leaf_names = mp.flist  # use final list only
    else:
        assert len(leaf_names) == len(df), 'leaf names should be a list with a length of %s' % len(df)
        df.index = leaf_names

    # Named `root` so the module-level tree() helper is not shadowed.
    root = to_tree(mp.Z, rd=False)
    newick = _getNewick(root, "", root.dist, leaf_names=leaf_names)

    # write newick file for itol
    with open(treefile + '.nwk', 'w') as f:
        f.write(newick)

    # write dataset file for itol
    df['TYPE'] = 'clade'
    df['STYLE'] = 'normal'
    df = df[['TYPE', 'colors', 'STYLE']]
    with open(treefile + '.txt', 'w') as f:
        f.write(itol_header)
    # The rows are appended after the header written above; no column header
    # line is emitted because iTOL expects raw data rows after DATA.
    df.to_csv(treefile + '.txt', mode='a', header=False, sep='\t')
    return df
76
    
77
def tree():
    """Return an empty, arbitrarily nestable mapping.

    The result is a ``defaultdict`` whose default factory is this function,
    so reading a missing key creates (and stores) another such mapping.
    """
    nested = defaultdict(tree)
    return nested
79
80
def tree_add(t, path):
    """Walk ``path`` down from ``t``, one key per level.

    Each element of ``path`` is looked up in the current level and the walk
    continues in the value found there. With an auto-creating mapping (such
    as a ``defaultdict``) missing levels are created by the lookup itself.
    Nothing is returned; the effect is on ``t``.
    """
    cursor = t
    for key in path:
        cursor = cursor[key]
83
84
def pprint_tree(tree_instance):
    """Pretty-print a nested mapping as plain ``dict`` objects.

    The mapping is first converted, level by level, into ordinary
    dictionaries and the result is passed to ``pprint``.
    """
    def as_plain_dict(node):
        plain = {}
        for key in node:
            plain[key] = as_plain_dict(node[key])
        return plain

    pprint(as_plain_dict(tree_instance))
87
88
def dfs_to_tree(dfs):
    """Build a nested tree from the rows of ``dfs``.

    Each row is read by position, its missing values are dropped, and the
    remaining values (in column order) are inserted as one root-to-leaf path.

    Returns the tree created by ``tree()`` after all rows have been added.
    """
    root = tree()
    # Lazily yield one path per row; NaN cells are skipped, so rows may
    # contribute paths of different depth.
    paths = (dfs.iloc[pos].dropna().tolist() for pos in range(len(dfs)))
    for path in paths:
        tree_add(root, path)
    return root
94
95
def tree_to_newick(root):
    """Render a nested mapping as a branch-length-free Newick fragment.

    Every key becomes a node label. A key whose value has children is
    written as ``(<children>)<key>``; siblings are joined with commas.
    No trailing ``;`` is added, and an empty mapping yields ``''``.
    """
    def render(label, children):
        # Leaf: nothing to wrap.
        if len(children.keys()) == 0:
            return label
        inner = tree_to_newick(children)
        # Children that render to nothing are treated like no children.
        if inner == '':
            return '' + label
        return '(' + inner + ')' + label

    return ','.join([render(key, root[key]) for key in root.keys()])
106
107
def dfs_to_weightless_newick(dfs):
    """Convert the rows of ``dfs`` into a Newick fragment without branch lengths.

    The rows are first turned into a nested tree with ``dfs_to_tree`` and
    that tree is then serialised with ``tree_to_newick``.
    """
    return tree_to_newick(dfs_to_tree(dfs))
111
112
113
if __name__ == '__main__':
    # Script mode: read lineage strings from ./species.list.csv, convert them
    # to a Newick fragment without branch lengths and write it to 1.nwk.

    # The file has no header row; its first column is the index and the
    # column labelled 1 holds the lineage strings.
    species_table = pd.read_csv('./species.list.csv', header=None, index_col=0)
    species_list = species_table[1].to_list()

    # Each lineage is split on '|' into entries and each entry on '__' into a
    # (level code, name) pair; one DataFrame column per level code results.
    lineage_dicts = pd.Series(species_list).apply(
        lambda x: dict(i.split('__') for i in x.split('|'))
    )
    dfs = lineage_dicts.apply(pd.Series)

    # Replace the one-letter level codes by their full names.
    level_dict = {
        'k': 'kingdom',
        'p': 'phylum',
        'c': 'class',
        'o': 'order',
        'f': 'family',
        'g': 'genus',
        's': 'species',
    }
    dfs = dfs.rename(columns=level_dict)

    nwk_string = dfs_to_weightless_newick(dfs)
    with open("1.nwk", "w") as f:
        f.write(nwk_string)
122