annotate format_input.py @ 3:5705c67a8be4 draft

Uploaded
author george-weingart
date Mon, 18 Aug 2014 02:33:43 -0400
parents 47ac77f2fe68
children a6284ef17bf3
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
1 #!/usr/bin/env python
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
2
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
3 import sys,os,argparse,pickle,re
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
4
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
5 def read_input_file(inp_file):
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
6 with open(inp_file) as inp:
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
7 return [[v.strip() for v in line.strip().split("\t")] for line in inp.readlines()]
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
8
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
9 def transpose(data):
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
10 return zip(*data)
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
11
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
12 def read_params(args):
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
13 parser = argparse.ArgumentParser(description='LEfSe formatting modules')
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
14 parser.add_argument('input_file', metavar='INPUT_FILE', type=str, help="the input file, feature hierarchical level can be specified with | or . and those symbols must not be present for other reasons in the input file.")
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
15 parser.add_argument('output_file', metavar='OUTPUT_FILE', type=str,
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
16 help="the output file containing the data for LEfSe")
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
17 parser.add_argument('--output_table', type=str, required=False, default="",
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
18 help="the formatted table in txt format")
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
19 parser.add_argument('-f',dest="feats_dir", choices=["c","r"], type=str, default="r",
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
20 help="set whether the features are on rows (default) or on columns")
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
21 parser.add_argument('-c',dest="class", metavar="[1..n_feats]", type=int, default=1,
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
22 help="set which feature use as class (default 1)")
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
23 parser.add_argument('-s',dest="subclass", metavar="[1..n_feats]", type=int, default=None,
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
24 help="set which feature use as subclass (default -1 meaning no subclass)")
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
25 parser.add_argument('-o',dest="norm_v", metavar="float", type=float, default=-1.0,
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
26 help="set the normalization value (default -1.0 meaning no normalization)")
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
27 parser.add_argument('-u',dest="subject", metavar="[1..n_feats]", type=int, default=None,
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
28 help="set which feature use as subject (default -1 meaning no subject)")
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
29 parser.add_argument('-m',dest="missing_p", choices=["f","s"], type=str, default="d",
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
30 help="set the policy to adopt with missin values: f removes the features with missing values, s removes samples with missing values (default f)")
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
31 parser.add_argument('-n',dest="subcl_min_card", metavar="int", type=int, default=10,
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
32 help="set the minimum cardinality of each subclass (subclasses with low cardinalities will be grouped together, if the cardinality is still low, no pairwise comparison will be performed with them)")
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
33
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
34 args = parser.parse_args()
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
35
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
36 return vars(args)
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
37
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
38 def remove_missing(data,roc):
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
39 if roc == "c": data = transpose(data)
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
40 max_len = max([len(r) for r in data])
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
41 to_rem = []
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
42 for i,r in enumerate(data):
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
43 if len([v for v in r if not( v == "" or v.isspace())]) < max_len: to_rem.append(i)
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
44 if len(to_rem):
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
45 for i in to_rem.reverse():
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
46 data.pop(i)
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
47 if roc == "c": return transpose(data)
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
48 return data
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
49
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
50
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
51 def sort_by_cl(data,n,c,s,u):
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
52 def sort_lines1(a,b):
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
53 return int(a[c] > b[c])*2-1
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
54 def sort_lines2u(a,b):
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
55 if a[c] != b[c]: return int(a[c] > b[c])*2-1
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
56 return int(a[u] > b[u])*2-1
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
57 def sort_lines2s(a,b):
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
58 if a[c] != b[c]: return int(a[c] > b[c])*2-1
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
59 return int(a[s] > b[s])*2-1
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
60 def sort_lines3(a,b):
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
61 if a[c] != b[c]: return int(a[c] > b[c])*2-1
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
62 if a[s] != b[s]: return int(a[s] > b[s])*2-1
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
63 return int(a[u] > b[u])*2-1
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
64 if n == 3: data.sort(sort_lines3)
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
65 if n == 2:
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
66 if s is None:
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
67 data.sort(sort_lines2u)
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
68 else:
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
69 data.sort(sort_lines2s)
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
70 if n == 1: data.sort(sort_lines1)
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
71 return data
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
72
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
73 def group_small_subclasses(cls,min_subcl):
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
74 last = ""
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
75 n = 0
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
76 repl = []
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
77 dd = [list(cls['class']),list(cls['subclass'])]
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
78 for d in dd:
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
79 if d[1] != last:
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
80 if n < min_subcl and last != "":
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
81 repl.append(d[1])
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
82 last = d[1]
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
83 n = 1
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
84 for i,d in enumerate(dd):
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
85 if d[1] in repl: dd[i][1] = "other"
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
86 dd[i][1] = str(dd[i][0])+"_"+str(dd[i][1])
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
87 cls['class'] = dd[0]
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
88 cls['subclass'] = dd[1]
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
89 return cls
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
90
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
91 def get_class_slices(data):
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
92 previous_class = data[0][0]
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
93 previous_subclass = data[0][1]
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
94 subclass_slices = []
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
95 class_slices = []
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
96 last_cl = 0
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
97 last_subcl = 0
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
98 class_hierarchy = []
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
99 subcls = []
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
100 for i,d in enumerate(data):
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
101 if d[1] != previous_subclass:
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
102 subclass_slices.append((previous_subclass,(last_subcl,i)))
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
103 last_subcl = i
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
104 subcls.append(previous_subclass)
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
105 if d[0] != previous_class:
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
106 class_slices.append((previous_class,(last_cl,i)))
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
107 class_hierarchy.append((previous_class,subcls))
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
108 subcls = []
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
109 last_cl = i
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
110 previous_subclass = d[1]
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
111 previous_class = d[0]
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
112 subclass_slices.append((previous_subclass,(last_subcl,i+1)))
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
113 subcls.append(previous_subclass)
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
114 class_slices.append((previous_class,(last_cl,i+1)))
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
115 class_hierarchy.append((previous_class,subcls))
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
116 return dict(class_slices), dict(subclass_slices), dict(class_hierarchy)
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
117
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
118 def numerical_values(feats,norm):
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
119 mm = []
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
120 for k,v in feats.items():
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
121 feats[k] = [float(val) for val in v]
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
122 if norm < 0.0: return feats
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
123 tr = zip(*(feats.values()))
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
124 mul = []
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
125 fk = feats.keys()
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
126 hie = True if sum([k.count(".") for k in fk]) > len(fk) else False
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
127 for i in range(len(feats.values()[0])):
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
128 if hie: mul.append(sum([t for j,t in enumerate(tr[i]) if fk[j].count(".") < 1 ]))
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
129 else: mul.append(sum(tr[i]))
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
130 if hie and sum(mul) == 0:
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
131 mul = []
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
132 for i in range(len(feats.values()[0])):
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
133 mul.append(sum(tr[i]))
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
134 for i,m in enumerate(mul):
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
135 if m == 0: mul[i] = 0.0
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
136 else: mul[i] = float(norm) / m
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
137 for k,v in feats.items():
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
138 feats[k] = [val*mul[i] for i,val in enumerate(v)]
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
139 return feats
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
140
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
141 def add_missing_levels2(ff):
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
142
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
143 if sum( [f.count(".") for f in ff] ) < 1: return ff
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
144
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
145 dn = {}
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
146
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
147 added = True
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
148 while added:
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
149 added = False
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
150 for f in ff:
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
151 lev = f.count(".")
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
152 if lev == 0: continue
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
153 if lev not in dn: dn[lev] = [f]
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
154 else: dn[lev].append(f)
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
155 for fn in sorted(dn,reverse=True):
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
156 for f in dn[fn]:
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
157 fc = ".".join(f.split('.')[:-1])
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
158 if fc not in ff:
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
159 ab_all = [ff[fg] for fg in ff if (fg.count(".") == 0 and fg == fc) or (fg.count(".") > 0 and fc == ".".join(fg.split('.')[:-1]))]
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
160 ab =[]
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
161 for l in [f for f in zip(*ab_all)]:
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
162 ab.append(sum([float(ll) for ll in l]))
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
163 ff[fc] = ab
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
164 added = True
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
165 if added:
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
166 break
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
167
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
168 return ff
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
169
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
170
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
171 def add_missing_levels(ff):
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
172 if sum( [f.count(".") for f in ff] ) < 1: return ff
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
173
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
174 clades2leaves = {}
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
175 for f in ff:
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
176 fs = f.split(".")
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
177 if len(fs) < 2:
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
178 continue
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
179 for l in range(len(fs)):
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
180 n = ".".join( fs[:l] )
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
181 if n in clades2leaves:
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
182 clades2leaves[n].append( f )
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
183 else:
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
184 clades2leaves[n] = [f]
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
185 for k,v in clades2leaves.items():
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
186 if k and k not in ff:
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
187 ff[k] = [sum(a) for a in zip(*[[float(fn) for fn in ff[vv]] for vv in v])]
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
188 return ff
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
189
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
190
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
191
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
192 def modify_feature_names(fn):
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
193 ret = fn
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
194
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
195 for v in [' ',r'\$',r'\@',r'#',r'%',r'\^',r'\&',r'\*',r'\"',r'\'']:
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
196 ret = [re.sub(v,"",f) for f in ret]
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
197 for v in ["/",r'\(',r'\)',r'-',r'\+',r'=',r'{',r'}',r'\[',r'\]',
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
198 r',',r'\.',r';',r':',r'\?',r'\<',r'\>',r'\.',r'\,']:
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
199 ret = [re.sub(v,"_",f) for f in ret]
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
200 for v in ["\|"]:
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
201 ret = [re.sub(v,".",f) for f in ret]
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
202
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
203 ret2 = []
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
204 for r in ret:
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
205 if r[0] in ['0','1','2','3','4','5','6','7','8','9']:
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
206 ret2.append("f_"+r)
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
207 else: ret2.append(r)
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
208
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
209 return ret2
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
210
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
211
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
212 def rename_same_subcl(cl,subcl):
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
213 toc = []
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
214 for sc in set(subcl):
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
215 if len(set([cl[i] for i in range(len(subcl)) if sc == subcl[i]])) > 1:
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
216 toc.append(sc)
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
217 new_subcl = []
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
218 for i,sc in enumerate(subcl):
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
219 if sc in toc: new_subcl.append(cl[i]+"_"+sc)
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
220 else: new_subcl.append(sc)
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
221 return new_subcl
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
222
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
223 if __name__ == '__main__':
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
224 params = read_params(sys.argv)
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
225
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
226 if type(params['subclass']) is int and int(params['subclass']) < 1:
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
227 params['subclass'] = None
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
228 if type(params['subject']) is int and int(params['subject']) < 1:
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
229 params['subject'] = None
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
230 data = read_input_file(sys.argv[1])
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
231
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
232 if params['feats_dir'] == "c":
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
233 data = transpose(data)
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
234
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
235 ncl = 1
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
236 if not params['subclass'] is None: ncl += 1
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
237 if not params['subject'] is None: ncl += 1
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
238
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
239 first_line = zip(*data)[0]
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
240
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
241 first_line = modify_feature_names(list(first_line))
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
242
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
243 data = zip( first_line,
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
244 *sort_by_cl(zip(*data)[1:],
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
245 ncl,
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
246 params['class']-1,
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
247 params['subclass']-1 if not params['subclass'] is None else None,
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
248 params['subject']-1 if not params['subject'] is None else None))
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
249 # data.insert(0,first_line)
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
250 # data = remove_missing(data,params['missing_p'])
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
251 cls = {}
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
252
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
253 cls_i = [('class',params['class']-1)]
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
254 if params['subclass'] > 0: cls_i.append(('subclass',params['subclass']-1))
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
255 if params['subject'] > 0: cls_i.append(('subject',params['subject']-1))
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
256 cls_i.sort(lambda x, y: -cmp(x[1],y[1]))
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
257 for v in cls_i: cls[v[0]] = data.pop(v[1])[1:]
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
258 if not params['subclass'] > 0: cls['subclass'] = [str(cl)+"_subcl" for cl in cls['class']]
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
259
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
260 cls['subclass'] = rename_same_subcl(cls['class'],cls['subclass'])
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
261 # if 'subclass' in cls.keys(): cls = group_small_subclasses(cls,params['subcl_min_card'])
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
262 class_sl,subclass_sl,class_hierarchy = get_class_slices(zip(*cls.values()))
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
263
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
264 feats = dict([(d[0],d[1:]) for d in data])
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
265
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
266 feats = add_missing_levels(feats)
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
267
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
268 feats = numerical_values(feats,params['norm_v'])
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
269 out = {}
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
270 out['feats'] = feats
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
271 out['norm'] = params['norm_v']
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
272 out['cls'] = cls
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
273 out['class_sl'] = class_sl
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
274 out['subclass_sl'] = subclass_sl
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
275 out['class_hierarchy'] = class_hierarchy
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
276
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
277 if params['output_table']:
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
278 with open( params['output_table'], "w") as outf:
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
279 if 'class' in cls: outf.write( "\t".join(list(["class"])+list(cls['class'])) + "\n" )
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
280 if 'subclass' in cls: outf.write( "\t".join(list(["subclass"])+list(cls['subclass'])) + "\n" )
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
281 if 'subject' in cls: outf.write( "\t".join(list(["subject"])+list(cls['subject'])) + "\n" )
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
282 for k,v in out['feats'].items(): outf.write( "\t".join([k]+[str(vv) for vv in v]) + "\n" )
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
283
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
284 with open(params['output_file'], 'wb') as back_file:
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
285 pickle.dump(out,back_file)
47ac77f2fe68 First version of lefse in this toolshed
george-weingart
parents:
diff changeset
286