0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 from argparse import ArgumentParser
|
|
4 from colorsys import hsv_to_rgb
|
|
5 from math import log
|
|
6 from StringIO import StringIO
|
|
7 from re import compile
|
|
8 from hclust2.hclust2 import DataMatrix
|
|
9
|
|
10
|
|
11 __author__ = 'Francesco Asnicar'
|
|
12 __email__ = "francesco.asnicar@gmail.com"
|
28
|
13 __version__ = '0.17'
|
|
14 __date__ = '21th August 2014'
|
0
|
15
|
|
16
|
|
17 def scale_color((h, s, v), factor=1.):
|
28
|
18 """
|
|
19 Takes as input a tuple that represents a color in HSV format, and optionally a scale factor.
|
|
20 Return an RGB string that is the converted HSV color, scaled by the given factor.
|
|
21 """
|
|
22 if (h < 0.) or (h > 360.):
|
|
23 raise Exception('[scale_color()] Hue value out of range (0, 360): ' + str(h))
|
|
24
|
|
25 if (s < 0.) or (s > 100.):
|
|
26 raise Exception('[scale_color()] Saturation value out of range (0, 100): ' + str(s))
|
0
|
27
|
28
|
28 if (v < 0.) or (v > 100.):
|
|
29 raise Exception('[scale_color()] Value value out of range (0, 100): ' + str(v))
|
0
|
30
|
28
|
31 if (factor < 0.) or (factor > 1.):
|
|
32 raise Exception('[scale_color()] Factor value out of range (0.0, 1.0): ' + str(factor))
|
0
|
33
|
28
|
34 v *= factor
|
|
35 r, g, b = hsv_to_rgb(h/360., s/100., v/100.)
|
0
|
36
|
28
|
37 return '#{0:02x}{1:02x}{2:02x}'.format(int(round(r*255.)), int(round(g*255.)), int(round(b*255.)))
|
0
|
38
|
|
39
|
|
40 def read_params():
|
28
|
41 """
|
|
42 Parse the input parameters, performing some validity check.
|
|
43 Return the parsed arguments.
|
|
44 """
|
|
45 parser = ArgumentParser(description="export2graphlan.py (ver. " + __version__ +
|
|
46 " of " + __date__ + "). Convert MetaPhlAn, LEfSe, and/or HUMAnN output to GraPhlAn input format. Authors: "
|
|
47 + __author__ + " (" + __email__ + ")")
|
0
|
48
|
28
|
49 # input parameters group
|
|
50 group = parser.add_argument_group(title='input parameters',
|
|
51 description="You need to provide at least one of the two arguments")
|
|
52 group.add_argument('-i', '--lefse_input',
|
|
53 type=str,
|
|
54 required=False,
|
|
55 help="LEfSe input data. A file that can be given to LEfSe for biomarkers analysis. It can be the result of a "
|
|
56 "MetaPhlAn or HUMAnN analysis")
|
|
57 group.add_argument('-o', '--lefse_output',
|
|
58 type=str,
|
|
59 required=False,
|
|
60 help="LEfSe output result data. The result of LEfSe analysis performed on the lefse_input file")
|
|
61
|
|
62 # output parameters group
|
|
63 group = parser.add_argument_group(title='output parameters')
|
|
64 group.add_argument('-t', '--tree',
|
|
65 type=str,
|
|
66 required=True,
|
|
67 help="Output filename where save the input tree for GraPhlAn")
|
|
68 group.add_argument('-a', '--annotation',
|
|
69 type=str,
|
|
70 required=True,
|
|
71 help="Output filename where save GraPhlAn annotation")
|
0
|
72
|
28
|
73 # annotations
|
|
74 parser.add_argument('--annotations',
|
|
75 default=None,
|
|
76 type=str,
|
|
77 required=False,
|
|
78 help="List which levels should be annotated in the tree. Use a comma separate values form, e.g., "
|
|
79 "--annotation_levels 1,2,3. Default is None")
|
|
80 parser.add_argument('--external_annotations',
|
|
81 default=None,
|
|
82 type=str,
|
|
83 required=False,
|
|
84 help="List which levels should use the external legend for the annotation. Use a comma separate values form, "
|
|
85 "e.g., --annotation_levels 1,2,3. Default is None")
|
|
86 # shaded background
|
|
87 parser.add_argument('--background_levels',
|
|
88 default=None,
|
|
89 type=str,
|
|
90 required=False,
|
|
91 help="List which levels should be highlight with a shaded background. Use a comma separate values form, e.g., "
|
|
92 "--background_levels 1,2,3. Default is None")
|
|
93 parser.add_argument('--background_clades',
|
|
94 default=None,
|
|
95 type=str,
|
|
96 required=False,
|
|
97 help="Specify the clades that should be highlight with a shaded background. Use a comma separate values form "
|
|
98 "and surround the string with \" if there are spaces. Example: --background_clades \"Bacteria.Actinobacteria, "
|
|
99 "Bacteria.Bacteroidetes.Bacteroidia, Bacteria.Firmicutes.Clostridia.Clostridiales\". Default is None")
|
|
100 parser.add_argument('--background_colors',
|
|
101 default=None,
|
|
102 type=str,
|
|
103 required=False,
|
|
104 help="Set the color to use for the shaded background. Colors can be either in RGB or HSV (using a semi-colon to "
|
|
105 "separate values, surrounded with ()) format. Use a comma separate values form and surround the string with "
|
|
106 "\" if it contains spaces. Example: --background_colors \"#29cc36, (150; 100; 100), (280; 80; 88)\". Default "
|
|
107 "is None")
|
|
108 # title
|
|
109 parser.add_argument('--title',
|
|
110 type=str,
|
|
111 required=False,
|
|
112 help="If specified set the title of the GraPhlAn plot. Surround the string with \" if it contains spaces, e.g., "
|
|
113 "--title \"Title example\"")
|
|
114 # title font size
|
|
115 parser.add_argument('--title_font_size',
|
|
116 default=15,
|
|
117 type=int,
|
|
118 required=False,
|
|
119 help="Set the title font size. Default is 15")
|
|
120 # clade size
|
|
121 parser.add_argument('--def_clade_size',
|
|
122 default=10.,
|
|
123 type=float,
|
|
124 required=False,
|
|
125 help="Set a default size for clades that are not found as biomarkers by LEfSe. Default is 10")
|
|
126 parser.add_argument('--min_clade_size',
|
|
127 default=20.,
|
|
128 type=float,
|
|
129 required=False,
|
|
130 help="Set the minimum value of clades that are biomarkers. Default is 20")
|
|
131 parser.add_argument('--max_clade_size',
|
|
132 default=200.,
|
|
133 type=float,
|
|
134 required=False,
|
|
135 help="Set the maximum value of clades that are biomarkers. Default is 200")
|
|
136 # font size
|
|
137 parser.add_argument('--def_font_size',
|
|
138 default=10,
|
|
139 type=int,
|
|
140 required=False,
|
|
141 help="Set a default font size. Default is 10")
|
|
142 parser.add_argument('--min_font_size',
|
|
143 default=8,
|
|
144 type=int,
|
|
145 required=False,
|
|
146 help="Set the minimum font size to use. Default is 8")
|
|
147 parser.add_argument('--max_font_size',
|
|
148 default=12,
|
|
149 type=int,
|
|
150 required=False,
|
|
151 help="Set the maximum font size. Default is 12")
|
|
152 # legend font size
|
|
153 parser.add_argument('--annotation_legend_font_size',
|
|
154 default=10,
|
|
155 type=int,
|
|
156 required=False,
|
|
157 help="Set the font size for the annotation legend. Default is 10")
|
|
158 # abundance threshold
|
|
159 parser.add_argument('--abundance_threshold',
|
|
160 default=20.,
|
|
161 type=float,
|
|
162 required=False,
|
|
163 help="Set the minimun abundace value for a clade to be annotated. Default is 20.0")
|
|
164 # ONLY lefse_input provided
|
|
165 parser.add_argument('--most_abundant',
|
|
166 default=10,
|
|
167 type=int,
|
|
168 required=False,
|
|
169 help="When only lefse_input is provided, you can specify how many clades highlight. Since the biomarkers are "
|
|
170 "missing, they will be chosen from the most abundant. Default is 10")
|
|
171 parser.add_argument('--least_biomarkers',
|
|
172 default=3,
|
|
173 type=int,
|
|
174 required=False,
|
|
175 help="When only lefse_input is provided, you can specify the minimum number of biomarkers to extract. The "
|
|
176 "taxonomy is parsed, and the level is choosen in order to have at least the specified number of biomarkers. "
|
|
177 "Default is 3")
|
|
178 # decide to keep the OTU id or to merger at the above taxonomic level
|
|
179 parser.add_argument('--discard_otus',
|
|
180 default=True,
|
|
181 action='store_false',
|
|
182 help="If specified the OTU ids will be discarde from the taxonmy. Default is True, i.e. keep OTUs IDs in taxonomy")
|
|
183 # decide to keep the OTU id or to merger at the above taxonomic level
|
|
184 parser.add_argument('--internal_levels',
|
|
185 default=False,
|
|
186 action='store_true',
|
|
187 help="If specified sum-up from leaf to root the abundances values. Default is False, i.e. do not sum-up abundances "
|
|
188 "on the internal nodes")
|
0
|
189
|
28
|
190 DataMatrix.input_parameters(parser)
|
|
191 args = parser.parse_args()
|
0
|
192
|
28
|
193 # check if at least one of the input params is given
|
|
194 if (not args.lefse_input) and (not args.lefse_output):
|
|
195 raise Exception("[read_params()] You must provide at least one of the two input parameters: ")
|
0
|
196
|
28
|
197 # check that min_clade_size is less than max_clade_size
|
|
198 if args.min_clade_size > args.max_clade_size:
|
|
199 print "[W] min_clade_size cannot be greater than max_clade_size, assigning their default values"
|
|
200 args.min_clade_size = 20.
|
|
201 args.max_clade_size = 200.
|
|
202
|
|
203 # check that min_font_size is less than max_font_size
|
|
204 if args.min_font_size > args.max_font_size:
|
|
205 print "[W] min_font_size cannot be greater than max_font_size, assigning their default values"
|
|
206 args.min_font_size = 8
|
|
207 args.max_font_size = 12
|
|
208
|
|
209 return args
|
0
|
210
|
|
211
|
|
212 def get_file_type(filename):
|
28
|
213 """
|
|
214 Return the extension (if any) of the ``filename`` in lower case.
|
|
215 """
|
|
216 return filename[filename.rfind('.')+1:].lower()
|
0
|
217
|
|
218
|
28
|
219 def parse_biom(filename, keep_otus=True, internal_levels=False):
|
|
220 """
|
|
221 Load a biom table and extract the taxonomy (from metadata), removing the unuseful header.
|
|
222 Return the input biom in tab-separated format.
|
|
223 """
|
|
224 from biom import load_table # avoid to ask for the BIOM library if there is no biom file
|
0
|
225
|
28
|
226 biom_table = load_table(filename)
|
|
227 strs = biom_table.delimited_self(header_value='TAXA', header_key='taxonomy')
|
|
228 lst1 = [str(s) for s in strs.split('\n')[1:]] # skip the "# Constructed from biom file" entry
|
|
229 biom_file = []
|
|
230 out = [lst1[0]] # save the header
|
|
231 pre_taxa = compile(".__")
|
|
232 classs = compile("\(class\)")
|
|
233
|
|
234 # consistency check
|
|
235 i = 0
|
|
236 while i < (len(lst1)-1):
|
|
237 if len([s for s in lst1[i].split('\t')]) != len([s for s in lst1[i+1].split('\t')]):
|
|
238 raise Exception('[parse_biom()] It seems that taxonomic metadata are missing, maybe is the wrong biom file?')
|
|
239
|
|
240 i += 1
|
0
|
241
|
28
|
242 for l in lst1[1:]:
|
|
243 otu = None
|
|
244 lst = [float(s.strip()) for s in l.split('\t')[1:-1]]
|
0
|
245
|
28
|
246 if keep_otus:
|
|
247 otu = l.split('\t')[0]
|
0
|
248
|
28
|
249 # Clean and move taxa in first place
|
|
250 taxa = '.'.join([s.strip().replace('[', '').replace('u\'', '').replace(']', '').replace(' ', '').replace('\'', '')
|
|
251 for s in l.split('\t')[-1].split(',')])
|
|
252 taxa = pre_taxa.sub('', taxa) # remove '{k|p|o|g|s}__'
|
|
253 taxa = classs.sub('', taxa) # remove '(class)'
|
|
254 taxa = taxa.rstrip('.') # remove trailing dots
|
|
255
|
|
256 if otu:
|
|
257 taxa = taxa + '.' + otu
|
0
|
258
|
28
|
259 biom_file.append([taxa] + lst)
|
0
|
260
|
28
|
261 # merge such rows that have the same taxa
|
|
262 i = 1
|
|
263 dic = {}
|
|
264
|
|
265 for l in biom_file[i:]:
|
|
266 if l[0] not in dic:
|
|
267 dic[l[0]] = l[1:]
|
0
|
268
|
28
|
269 for k in biom_file[i+1:]:
|
|
270 if l[0] == k[0]:
|
|
271 lst = []
|
|
272 lstdic = dic[l[0]]
|
|
273 j = 1
|
|
274 while j < len(lstdic):
|
|
275 lst.append(float(lstdic[j]) + float(k[j]))
|
|
276 j += 1
|
0
|
277
|
28
|
278 dic[l[0]] = lst
|
|
279 i += 1
|
|
280
|
|
281 feats = dict(dic)
|
0
|
282
|
28
|
283 if internal_levels:
|
|
284 feats = add_missing_levels(feats)
|
|
285
|
|
286 for k in feats:
|
|
287 out.append('\t'.join([str(s) for s in [k] + feats[k]]))
|
|
288
|
|
289 return '\n'.join(out)
|
0
|
290
|
28
|
291
|
|
292 def add_missing_levels(ff, summ=True):
|
|
293 """
|
|
294 Sum-up the internal abundances from leaf to root
|
|
295 """
|
|
296 if sum([f.count(".") for f in ff]) < 1:
|
|
297 return ff
|
0
|
298
|
28
|
299 clades2leaves = {}
|
|
300 for f in ff:
|
|
301 fs = f.split(".")
|
|
302
|
|
303 if len(fs) < 2:
|
|
304 continue
|
0
|
305
|
28
|
306 for l in range(1, len(fs)+1):
|
|
307 n = ".".join(fs[:l])
|
0
|
308
|
28
|
309 if n in clades2leaves:
|
|
310 clades2leaves[n].append(f)
|
|
311 else:
|
|
312 clades2leaves[n] = [f]
|
0
|
313
|
28
|
314 ret = {}
|
|
315 for k in clades2leaves:
|
|
316 if summ:
|
|
317 ret[k] = [sum([sum(ff[e]) for e in clades2leaves[k]])]
|
|
318 else:
|
|
319 lst = []
|
|
320 for e in clades2leaves[k]:
|
|
321 if not lst:
|
|
322 for i in ff[e]:
|
|
323 lst.append(i)
|
|
324 else:
|
|
325 lst1 = []
|
|
326 i = 0
|
|
327 while i < len(lst):
|
|
328 lst1.append(lst[i] + ff[e][i])
|
|
329 i += 1
|
|
330 lst = lst1
|
0
|
331
|
28
|
332 ret[k] = lst
|
0
|
333
|
28
|
334 return ret
|
0
|
335
|
|
336
|
|
337 def get_most_abundant(abundances, xxx):
|
28
|
338 """
|
|
339 Sort by the abundance level all the taxonomy that represent at least two levels.
|
|
340 Return the first ``xxx`` most abundant.
|
|
341 """
|
|
342 abundant = []
|
0
|
343
|
28
|
344 for a in abundances:
|
|
345 if a.count('|') > 0:
|
|
346 abundant.append((float(abundances[a]), a.replace('|', '.')))
|
|
347 elif a.count('.') > 0:
|
|
348 abundant.append((float(abundances[a]), a))
|
0
|
349
|
28
|
350 abundant.sort(reverse=True)
|
|
351 return abundant[:xxx]
|
0
|
352
|
|
353
|
|
354 def get_biomarkes(abundant, xxx):
|
28
|
355 """
|
|
356 Split the taxonomy and then look, level by level, when there are at least ``xxx`` distinct branches.
|
|
357 Return the set of branches as biomarkers to highlight.
|
|
358 """
|
|
359 cc = []
|
|
360 bk = set()
|
|
361 lvl = 0
|
0
|
362
|
28
|
363 for _, t in abundant:
|
|
364 cc.append(t.split('.'))
|
0
|
365
|
28
|
366 while lvl < len(max(cc)):
|
|
367 bk = set()
|
0
|
368
|
28
|
369 for c in cc:
|
|
370 if lvl < len(c):
|
|
371 bk |= set([c[lvl]])
|
0
|
372
|
28
|
373 if len(bk) >= xxx:
|
|
374 break
|
0
|
375
|
28
|
376 lvl += 1
|
0
|
377
|
28
|
378 return bk
|
0
|
379
|
|
380 def scale_clade_size(minn, maxx, abu, max_abu):
|
28
|
381 """
|
|
382 Return the value of ``abu`` scaled to ``max_abu`` logarithmically, and then map from ``minn`` to ``maxx``.
|
|
383 """
|
|
384 return minn + maxx*log(1. + (abu/max_abu))
|
0
|
385
|
|
386
|
|
387 def main():
|
28
|
388 """
|
|
389 """
|
|
390 colors = [(245., 90., 100.), (125., 80., 80.), (0., 80., 100.), (195., 100., 100.),
|
|
391 (150., 100., 100.), (55., 100., 100.), (280., 80., 88.)] # HSV format
|
|
392 args = read_params()
|
|
393 lefse_input = None
|
|
394 lefse_output = {}
|
|
395 color = {}
|
|
396 biomarkers = set()
|
|
397 taxa = []
|
|
398 abundances = {}
|
|
399 max_abundances = None
|
|
400 max_effect_size = None
|
|
401 max_log_effect_size = None
|
|
402 background_list = []
|
|
403 background_clades = []
|
|
404 background_colors = {}
|
|
405 annotations_list = []
|
|
406 external_annotations_list = []
|
|
407 lin = False
|
|
408 lout = False
|
0
|
409
|
28
|
410 # get the levels that should be shaded
|
|
411 if args.background_levels:
|
|
412 background_list = [int(i.strip()) for i in args.background_levels.strip().split(',')]
|
0
|
413
|
28
|
414 # get the background_clades
|
|
415 if args.background_clades:
|
|
416 if get_file_type(args.background_colors) in ['txt']:
|
|
417 with open(args.background_clades, 'r') as f:
|
|
418 background_clades = [str(s.strip()) for s in f]
|
|
419 else: # it's a string in csv format
|
|
420 background_clades = [str(s.strip()) for s in args.background_clades.split(',')]
|
0
|
421
|
28
|
422 # read the set of colors to use for the background_clades
|
|
423 if args.background_colors:
|
|
424 col = []
|
0
|
425
|
28
|
426 if get_file_type(args.background_colors) in ['txt']:
|
|
427 with open(args.background_colors, 'r') as f:
|
|
428 col = [str(s.strip()) for s in f]
|
|
429 else: # it's a string in csv format
|
|
430 col = [c.strip() for c in args.background_colors.split(',')]
|
0
|
431
|
28
|
432 lst = {}
|
|
433 i = 0
|
0
|
434
|
28
|
435 for c in background_clades:
|
|
436 cc = c[:c.find('.')]
|
0
|
437
|
28
|
438 if cc not in lst:
|
|
439 background_colors[c] = col[i % len(col)]
|
|
440 lst[cc] = col[i % len(col)]
|
|
441 i += 1
|
|
442 else:
|
|
443 background_colors[c] = lst[cc]
|
0
|
444
|
28
|
445 # get the levels that will use the internal annotation
|
|
446 if args.annotations:
|
|
447 annotations_list = [int(i.strip()) for i in args.annotations.strip().split(',')]
|
0
|
448
|
28
|
449 # get the levels that will use the external legend annotation
|
|
450 if args.external_annotations:
|
|
451 external_annotations_list = [int(i.strip()) for i in args.external_annotations.strip().split(',')]
|
0
|
452
|
28
|
453 if args.lefse_input:
|
|
454 # if the lefse_input is in biom format, convert it
|
|
455 if get_file_type(args.lefse_input) in 'biom':
|
|
456 try:
|
|
457 biom = parse_biom(args.lefse_input, args.discard_otus, args.internal_levels)
|
|
458 lefse_input = DataMatrix(StringIO(biom), args)
|
|
459 except Exception as e:
|
|
460 lin = True
|
|
461 print e
|
|
462 else:
|
|
463 if args.internal_levels:
|
|
464 aaa = {}
|
|
465 header = None
|
|
466 with open(args.lefse_input, 'r') as f:
|
|
467 for r in f:
|
|
468 if header is None:
|
|
469 header = [s.strip() for s in r.split('\t')]
|
|
470 else:
|
|
471 row = r.split('\t')
|
|
472 aaa[row[0].strip().replace('|', '.')] = [float(s.strip()) for s in row[1:]]
|
0
|
473
|
28
|
474 feats = add_missing_levels(aaa, summ=False)
|
|
475 ss = '\t'.join(header)
|
|
476 ss += '\n'
|
|
477 ss += '\n'.join(['\t'.join([str(s) for s in [k] + feats[k]]) for k in feats])
|
|
478 lefse_input = DataMatrix(StringIO(ss), args)
|
|
479 else:
|
|
480 lefse_input = DataMatrix(args.lefse_input, args)
|
0
|
481
|
28
|
482 if not lin:
|
|
483 taxa = [t.replace('|', '.').strip() for t in lefse_input.get_fnames()] # build taxonomy list
|
|
484 abundances = dict(lefse_input.get_averages())
|
|
485 max_abundances = max([abundances[x] for x in abundances])
|
|
486 else: # no lefse_input provided
|
|
487 lin = True
|
0
|
488
|
28
|
489 if args.lefse_output:
|
|
490 # if the lefse_output is in biom format... I don't think it's possible!
|
|
491 if get_file_type(args.lefse_output) in 'biom':
|
|
492 lout = True
|
|
493 print "Seriously?? LEfSe output file is not expected to be in biom format!"
|
|
494 else:
|
|
495 lst = []
|
0
|
496
|
28
|
497 with open(args.lefse_output, 'r') as out_file:
|
|
498 for line in out_file:
|
|
499 t, m, bk, es, pv = line.strip().split('\t')
|
|
500 lefse_output[t] = (es, bk, m, pv)
|
0
|
501
|
28
|
502 # get distinct biomarkers
|
|
503 if bk:
|
|
504 biomarkers |= set([bk])
|
0
|
505
|
28
|
506 # get all effect size
|
|
507 if es:
|
|
508 lst.append(float(es))
|
|
509
|
|
510 max_effect_size = max(lst)
|
0
|
511
|
28
|
512 # no lefse_input file provided!
|
|
513 if (not taxa) and (not abundances): # build taxonomy list and abundaces map
|
|
514 for t in lefse_output:
|
|
515 _, _, m, _ = lefse_output[t]
|
|
516 abundances[t.replace('.', '|')] = float(m)
|
|
517
|
|
518 max_abundances = max([abundances[x] for x in abundances])
|
0
|
519
|
28
|
520 for t in lefse_output:
|
|
521 scaled = scale_clade_size(args.min_clade_size, args.max_clade_size,
|
|
522 abundances[t.replace('.', '|')], max_abundances)
|
0
|
523
|
28
|
524 if scaled >= args.abundance_threshold:
|
|
525 taxa.append(t)
|
|
526 elif not lin: # no lefse_output provided and lefse_input correctly red
|
|
527 lout = True
|
0
|
528
|
28
|
529 # find the xxx most abundant
|
|
530 abundant = get_most_abundant(abundances, args.most_abundant)
|
0
|
531
|
28
|
532 # find the taxonomy level with at least yyy distinct childs from the xxx most abundant
|
|
533 biomarkers = get_biomarkes(abundant, args.least_biomarkers)
|
0
|
534
|
28
|
535 # compose lefse_output variable
|
|
536 for _, t in abundant:
|
|
537 b = ''
|
0
|
538
|
28
|
539 for bk in biomarkers:
|
|
540 if bk in t:
|
|
541 b = bk
|
0
|
542
|
28
|
543 lefse_output[t] = (2., b, '', '')
|
0
|
544
|
28
|
545 max_effect_size = 1. # It's not gonna working
|
|
546 # no lefse_output and no lefse_input provided
|
|
547 if lin and lout:
|
|
548 print "You must provide at least one input file!"
|
|
549 exit(1)
|
0
|
550
|
28
|
551 # write the tree
|
|
552 with open(args.tree, 'w') as tree_file:
|
|
553 tree_file.write('\n'.join(taxa))
|
0
|
554
|
28
|
555 # for each biomarker assign it to a different color
|
|
556 i = 0
|
0
|
557
|
28
|
558 for bk in biomarkers:
|
|
559 color[bk] = i % len(colors)
|
|
560 i += 1
|
0
|
561
|
28
|
562 # find max log abs value of effect size
|
|
563 if lefse_output:
|
|
564 lst = []
|
|
565
|
|
566 for t in lefse_output:
|
|
567 es, _, _, _ = lefse_output[t]
|
0
|
568
|
28
|
569 if es:
|
|
570 lst.append(abs(log(float(es) / max_effect_size)))
|
|
571
|
|
572 max_log_effect_size = max(lst)
|
0
|
573
|
28
|
574 # write the annotation
|
|
575 try:
|
|
576 with open(args.annotation, 'w') as annot_file:
|
|
577 # set the title
|
|
578 if args.title:
|
|
579 annot_file.write('\n'.join(['\t'.join(['title', args.title]),
|
|
580 '\t'.join(['title_font_size', str(args.title_font_size)]), '\n']))
|
0
|
581
|
28
|
582 # write some basic customizations
|
|
583 annot_file.write('\n'.join(['\t'.join(['clade_separation', '0.5']),
|
|
584 '\t'.join(['branch_bracket_depth', '0.8']),
|
|
585 '\t'.join(['branch_bracket_width', '0.2']),
|
|
586 '\t'.join(['annotation_legend_font_size', str(args.annotation_legend_font_size)]),
|
|
587 '\t'.join(['class_legend_font_size', '10']),
|
|
588 '\t'.join(['class_legend_marker_size', '1.5']), '\n']))
|
0
|
589
|
28
|
590 # write the biomarkers' legend
|
|
591 for bk in biomarkers:
|
|
592 biom = bk.replace('_', ' ').upper()
|
|
593 rgb = scale_color(colors[color[bk]])
|
|
594 annot_file.write('\n'.join(['\t'.join([biom, 'annotation', biom]),
|
|
595 '\t'.join([biom, 'clade_marker_color', rgb]),
|
|
596 '\t'.join([biom, 'clade_marker_size', '40']), '\n']))
|
0
|
597
|
28
|
598 # write the annotation for the tree
|
|
599 for taxonomy in taxa:
|
|
600 level = taxonomy.count('.') + 1 # which level is this taxonomy?
|
|
601 clean_taxonomy = taxonomy[taxonomy.rfind('.') + 1:] # retrieve the last level in taxonomy
|
|
602 cleanest_taxonomy = clean_taxonomy.replace('_', ' ') # substitute '_' with ' '
|
|
603 scaled = args.def_clade_size
|
0
|
604
|
28
|
605 # scaled the size of the clade by the average abundance
|
|
606 if (taxonomy in abundances) or (taxonomy.replace('.', '|') in abundances):
|
|
607 try:
|
|
608 abu = abundances[taxonomy.replace('.', '|')]
|
|
609 except:
|
|
610 abu = abundances[taxonomy]
|
|
611
|
|
612 scaled = scale_clade_size(args.min_clade_size, args.max_clade_size, abu, max_abundances)
|
0
|
613
|
28
|
614 annot_file.write(''.join(['\t'.join([clean_taxonomy, 'clade_marker_size', str(scaled)]), '\n']))
|
|
615
|
|
616 # put a bakcground annotation to the levels specified by the user
|
|
617 shaded_background = []
|
0
|
618
|
28
|
619 for l in background_list:
|
|
620 if level >= l:
|
|
621 lst = [s.strip() for s in taxonomy.strip().split('.')]
|
|
622 t = '.'.join(lst[:l])
|
0
|
623
|
28
|
624 if t not in shaded_background:
|
|
625 shaded_background.append(t)
|
0
|
626
|
28
|
627 font_size = args.min_font_size + ((args.max_font_size - args.min_font_size) / l)
|
|
628
|
|
629 annot_file.write('\n'.join(['\t'.join([t, 'annotation_background_color', args.background_color]),
|
|
630 '\t'.join([t, 'annotation', t.replace('_', ' ')]),
|
|
631 '\t'.join([t, 'annotation_font_size', str(font_size)]), '\n']))
|
0
|
632
|
28
|
633 # put a bakcground annotation to the clades specified by the user
|
|
634 for c in background_colors:
|
|
635 bg_color = background_colors[c]
|
0
|
636
|
28
|
637 if not bg_color.startswith('#'):
|
|
638 bg_color = bg_color.replace('(', '').replace(')', '')
|
|
639 h, s, v = bg_color.split(';')
|
|
640 bg_color = scale_color((float(h.strip()) , float(s.strip()), float(v.strip())))
|
0
|
641
|
28
|
642 # check if the taxonomy has more than one level
|
|
643 lvls = [str(cc.strip()) for cc in c.split('.')]
|
|
644 done_clades = []
|
0
|
645
|
28
|
646 for l in lvls:
|
|
647 if (l in taxonomy) and (l not in done_clades):
|
|
648 lvl = taxonomy[:taxonomy.index(l)].count('.') + 1
|
|
649 font_size = args.min_font_size + ((args.max_font_size - args.min_font_size) / lvl)
|
|
650
|
|
651 annot_file.write('\n'.join(['\t'.join([l, 'annotation_background_color', bg_color]),
|
|
652 '\t'.join([l, 'annotation', l.replace('_', ' ')]),
|
|
653 '\t'.join([l, 'annotation_font_size', str(font_size)]), '\n']))
|
0
|
654
|
28
|
655 done_clades.append(l)
|
0
|
656
|
28
|
657 if lefse_output:
|
|
658 if taxonomy in lefse_output:
|
|
659 es, bk, _, _ = lefse_output[taxonomy]
|
0
|
660
|
28
|
661 # if it is a biomarker then color and label it!
|
|
662 if bk:
|
|
663 fac = abs(log(float(es) / max_effect_size)) / max_log_effect_size
|
0
|
664
|
28
|
665 try:
|
|
666 rgbs = scale_color(colors[color[bk]], fac)
|
|
667 except Exception as e:
|
|
668 print e
|
|
669 print ' '.join(["[W] Assign to", taxonomy, "the default color:", colors[color[bk]]])
|
|
670 rgbs = colors[color[bk]]
|
|
671
|
|
672 annot_file.write(''.join(['\t'.join([clean_taxonomy, 'clade_marker_color', rgbs]), '\n']))
|
0
|
673
|
28
|
674 # write the annotation only if the abundance is above a given threshold and it is either
|
|
675 # internal or external annotation lists
|
|
676 if (scaled >= args.abundance_threshold) and \
|
|
677 ((level in annotations_list) or (level in external_annotations_list)):
|
|
678 font_size = args.min_font_size + ((args.max_font_size - args.min_font_size) / level)
|
|
679 annotation = cleanest_taxonomy if level in annotations_list else '*:' + cleanest_taxonomy
|
0
|
680
|
28
|
681 annot_file.write('\n'.join(['\t'.join([clean_taxonomy, 'annotation_background_color', rgbs]),
|
|
682 '\t'.join([clean_taxonomy, 'annotation', annotation]),
|
|
683 '\t'.join([clean_taxonomy, 'annotation_font_size', str(font_size)]), '\n']))
|
|
684 except Exception as e:
|
|
685 print e
|
0
|
686
|
|
687
|
28
|
688 if __name__ == '__main__':
|
|
689 main()
|
|
690
|