comparison export2graphlan.py @ 0:cac6247cb1d3 draft

graphlan_import
author george-weingart
date Tue, 26 Aug 2014 14:51:29 -0400
parents
children 82fb838d02dc
comparison
equal deleted inserted replaced
-1:000000000000 0:cac6247cb1d3
1 #!/usr/bin/env python
2
3 from argparse import ArgumentParser
4 from colorsys import hsv_to_rgb
5 from math import log
6 from StringIO import StringIO
7 from re import compile
8 from hclust2.hclust2 import DataMatrix
9
10
11 __author__ = 'Francesco Asnicar'
12 __email__ = "francesco.asnicar@gmail.com"
13 __version__ = '0.13'
14 __date__ = '29th July 2014'
15
16
17 def scale_color((h, s, v), factor=1.):
18 """
19 Takes as input a tuple that represents a color in HSV format, and optionally a scale factor.
20 Return an RGB string that is the converted HSV color, scaled by the given factor.
21 """
22 if (h < 0.) or (h > 360.):
23 raise Exception('[scale_color()] Hue value out of range (0, 360): ' + str(h))
24
25 if (s < 0.) or (s > 100.):
26 raise Exception('[scale_color()] Saturation value out of range (0, 100): ' + str(s))
27
28 if (v < 0.) or (v > 100.):
29 raise Exception('[scale_color()] Value value out of range (0, 100): ' + str(v))
30
31 if (factor < 0.) or (factor > 1.):
32 raise Exception('[scale_color()] Factor value out of range (0.0, 1.0): ' + str(factor))
33
34 v *= factor
35 r, g, b = hsv_to_rgb(h/360., s/100., v/100.)
36
37 return '#{0:02x}{1:02x}{2:02x}'.format(int(round(r*255.)), int(round(g*255.)), int(round(b*255.)))
38
39
40 def read_params():
41 """
42 Parse the input parameters, performing some validity check.
43 Return the parsed arguments.
44 """
45 parser = ArgumentParser(description="export2graphlan.py (ver. " + __version__ +
46 " of " + __date__ + "). Convert MetaPhlAn, LEfSe, and/or HUMAnN output to GraPhlAn input format. Authors: "
47 + __author__ + " (" + __email__ + ")")
48
49 # input parameters group
50 group = parser.add_argument_group(title='input parameters',
51 description="You need to provide at least one of the two arguments")
52 group.add_argument('-i', '--lefse_input',
53 type=str,
54 required=False,
55 help="LEfSe input data")
56 group.add_argument('-o', '--lefse_output',
57 type=str,
58 required=False,
59 help="LEfSe output result data")
60
61 # output parameters group
62 group = parser.add_argument_group(title='output parameters')
63 group.add_argument('-t', '--tree',
64 type=str,
65 required=True,
66 help="Output filename where save the input tree for GraPhlAn")
67 group.add_argument('-a', '--annotation',
68 type=str,
69 required=True,
70 help="Output filename where save GraPhlAn annotation")
71
72 # annotations
73 parser.add_argument('--annotations',
74 default=None,
75 type=str,
76 required=False,
77 help="List which levels should be annotated in the tree. Use a comma separate values form, e.g., --annotation_levels 1,2,3. Default is None")
78 parser.add_argument('--external_annotations',
79 default=None,
80 type=str,
81 required=False,
82 help="List which levels should use the external legend for the annotation. Use a comma separate values form, e.g., --annotation_levels 1,2,3. Default is None")
83 # shaded background
84 parser.add_argument('--background_levels',
85 default=None,
86 type=str,
87 required=False,
88 help="List which levels should be highlight with a shaded background. Use a comma separate values form, e.g., --background_levels 1,2,3")
89 parser.add_argument('--background_clades',
90 default=None,
91 type=str,
92 required=False,
93 help="Specify the clades that should be highlight with a shaded background. Use a comma separate values form and surround the string with \" if it contains spaces. Example: --background_clades \"Bacteria.Actinobacteria, Bacteria.Bacteroidetes.Bacteroidia, Bacteria.Firmicutes.Clostridia.Clostridiales\"")
94 parser.add_argument('--background_colors',
95 default=None,
96 type=str,
97 required=False,
98 help="Set the color to use for the shaded background. Colors can be either in RGB or HSV (using a semi-colon to separate values, surrounded with ()) format. Use a comma separate values form and surround the string with \" if it contains spaces. Example: --background_colors \"#29cc36, (150; 100; 100), (280; 80; 88)\"")
99 # title
100 parser.add_argument('--title',
101 type=str,
102 required=False,
103 help="If specified set the title of the GraPhlAn plot. Surround the string with \" if it contains spaces, e.g., --title \"Title example\"")
104 # title font size
105 parser.add_argument('--title_font_size',
106 default=15,
107 type=int,
108 required=False,
109 help="Set the title font size. Default is 15")
110 # clade size
111 parser.add_argument('--def_clade_size',
112 default=10.,
113 type=float,
114 required=False,
115 help="Set a default size for clades that are not found as biomarkers by LEfSe. Default is 10")
116 parser.add_argument('--min_clade_size',
117 default=20.,
118 type=float,
119 required=False,
120 help="Set the minimum value of clades that are biomarkers. Default is 20")
121 parser.add_argument('--max_clade_size',
122 default=200.,
123 type=float,
124 required=False,
125 help="Set the maximum value of clades that are biomarkers. Default is 200")
126 # font size
127 parser.add_argument('--def_font_size',
128 default=10,
129 type=int,
130 required=False,
131 help="Set a default font size. Default is 10")
132 parser.add_argument('--min_font_size',
133 default=8,
134 type=int,
135 required=False,
136 help="Set the minimum font size to use. Default is 8")
137 parser.add_argument('--max_font_size',
138 default=12,
139 type=int,
140 required=False,
141 help="Set the maximum font size. Default is 12")
142 # legend font size
143 parser.add_argument('--annotation_legend_font_size',
144 default=10,
145 type=int,
146 required=False,
147 help="Set the font size for the annotation legend. Default is 10")
148 # abundance threshold
149 parser.add_argument('--abundance_threshold',
150 default=20.,
151 type=float,
152 required=False,
153 help="Set the minimun abundace value for a clade to be annotated. Default is 20.0")
154 # ONLY lefse_input provided
155 parser.add_argument('--most_abundant',
156 default=10,
157 type=int,
158 required=False,
159 help="When only lefse_input is provided, you can specify how many clades highlight. Since the biomarkers are missing, they will be chosen from the most abundant")
160 parser.add_argument('--least_biomarkers',
161 default=3,
162 type=int,
163 required=False,
164 help="When only lefse_input is provided, you can specify the minimum number of biomarkers to extract. The taxonomy is parsed, and the level is choosen in order to have at least the specified number of biomarkers")
165 # decide to keep the OTU id or to merger at the above taxonomic level
166 parser.add_argument('--discard_otus',
167 default=True,
168 action='store_false',
169 help="If specified the OTU ids will be discarde from the taxonmy. Default behavior keep OTU ids in taxonomy")
170
171 DataMatrix.input_parameters(parser)
172 args = parser.parse_args()
173
174 # check if at least one of the input params is given
175 if (not args.lefse_input) and (not args.lefse_output) :
176 raise Exception("[read_params()] You must provide at least one of the two input parameters: ")
177
178 # check that min_clade_size is less than max_clade_size
179 if args.min_clade_size > args.max_clade_size :
180 print "[W] min_clade_size cannot be greater than max_clade_size, assigning their default values"
181 args.min_clade_size = 20.
182 args.max_clade_size = 200.
183
184 # check that min_font_size is less than max_font_size
185 if args.min_font_size > args.max_font_size :
186 print "[W] min_font_size cannot be greater than max_font_size, assigning their default values"
187 args.min_font_size = 8
188 args.max_font_size = 12
189
190 return args
191
192
193 def get_file_type(filename):
194 """
195 Return the extension (if any) of the ``filename`` in lower case.
196 """
197 return filename[filename.rfind('.')+1:].lower()
198
199
200 def parse_biom(filename, keep_otus=True):
201 """
202 Load a biom table and extract the taxonomy (from metadata), removing the unuseful header.
203 Return the input biom in tab-separated format.
204 """
205 from biom import load_table # avoid to ask for the BIOM library if there is no biom file
206
207 biom_table = load_table(filename)
208 strs = biom_table.delimited_self(header_value='TAXA', header_key='taxonomy')
209 lst1 = [str(s) for s in strs.split('\n')[1:]] # skip the "# Constructed from biom file" entry
210 biom_file = []
211 out = [lst1[0]] # save the header
212 pre_taxa = compile(".__")
213 classs = compile("\(class\)")
214
215 # consistency check
216 i = 0
217 while i < (len(lst1)-1):
218 if len([s for s in lst1[i].split('\t')]) != len([s for s in lst1[i+1].split('\t')]) :
219 raise Exception('[parse_biom()] It seems that taxonomic metadata are missing, maybe is the wrong biom file?')
220
221 i += 1
222
223 for l in lst1[1:]:
224 otu = None
225 lst = [str(s).strip() for s in l.split('\t')[1:-1]]
226
227 if keep_otus:
228 otu = l.split('\t')[0]
229
230 # Clean an move taxa in first place
231 taxa = '.'.join([s.strip().replace('[', '').replace('u\'', '').replace(']', '').replace(' ', '').replace('\'', '') for s in l.split('\t')[-1].split(',')])
232 taxa = pre_taxa.sub('', taxa) # remove '{k|p|o|g|s}__'
233 taxa = classs.sub('', taxa) # remove '(class)'
234 taxa = taxa.rstrip('.') # remove trailing dots
235
236 if otu:
237 taxa = taxa + '.' + otu
238
239 biom_file.append([taxa] + lst)
240
241 # merge such rows that have the same taxa
242 i = 1
243 dic = {}
244
245 for l in biom_file[i:]:
246 for k in biom_file[i+1:]:
247 if l[0] == k[0]:
248 lst = []
249 j = 1
250
251 while j < len(l):
252 lst.append(float(l[j]) + float(k[j]))
253 j += 1
254
255 if l[0] in dic:
256 lst1 = dic[l[0]]
257 j = 0
258 lst2 = []
259
260 while j < len(lst1):
261 lst2.append(float(lst1[j]) + float(lst[j]))
262 j += 1
263
264 lst = lst2
265
266 dic[l[0]] = lst
267 # if not in dic, add it!
268 if l[0] not in dic:
269 dic[l[0]] = l[1:]
270
271 i += 1
272
273 for k in dic:
274 out.append('\t'.join([str(s) for s in [k] + dic[k]]))
275
276 return '\n'.join(out)
277
278
279 def get_most_abundant(abundances, xxx):
280 """
281 Sort by the abundance level all the taxonomy that represent at least two levels.
282 Return the first ``xxx`` most abundant.
283 """
284 abundant = []
285
286 for a in abundances :
287 if a.count('|') > 0:
288 abundant.append((float(abundances[a]), a.replace('|', '.')))
289 elif a.count('.') > 0:
290 abundant.append((float(abundances[a]), a))
291
292 abundant.sort(reverse=True)
293 return abundant[:xxx]
294
295
296 def get_biomarkes(abundant, xxx):
297 """
298 Split the taxonomy and then look, level by level, when there are at least ``xxx`` distinct branches.
299 Return the set of branches as biomarkers to highlight.
300 """
301 cc = []
302 bk = set()
303 lvl = 0
304
305 for _, t in abundant:
306 cc.append(t.split('.'))
307
308 while lvl < len(max(cc)):
309 bk = set()
310
311 for c in cc:
312 if lvl < len(c):
313 bk |= set([c[lvl]])
314
315 if len(bk) >= xxx:
316 break
317
318 lvl += 1
319
320 return bk
321
322 def scale_clade_size(minn, maxx, abu, max_abu):
323 """
324 Return the value of ``abu`` scaled to ``max_abu`` logarithmically, and then map from ``minn`` to ``maxx``.
325 """
326 return minn + maxx*log(1. + (abu/max_abu))
327
328
329 def main():
330 """
331 """
332 colors = [(245., 90., 100.), (125., 80., 80.), (0., 80., 100.), (195., 100., 100.), (150., 100., 100.), (55., 100., 100.), (280., 80., 88.)] # HSV format
333 args = read_params()
334 lefse_input = None
335 lefse_output = {}
336 color = {}
337 biomarkers = set()
338 taxa = []
339 abundances = {}
340 max_abundances = None
341 max_effect_size = None
342 max_log_effect_size = None
343 background_list = []
344 background_clades = []
345 background_colors = {}
346 annotations_list = []
347 external_annotations_list = []
348 lin = False
349 lout = False
350
351 # get the levels that should be shaded
352 if args.background_levels :
353 background_list = [int(i.strip()) for i in args.background_levels.strip().split(',')]
354
355 # get the background_clades
356 if args.background_clades :
357 if get_file_type(args.background_colors) in ['txt'] :
358 with open(args.background_clades, 'r') as f:
359 background_clades = [str(s.strip()) for s in f]
360 else : # it's a string in csv format
361 background_clades = [str(s.strip()) for s in args.background_clades.split(',')]
362
363 # read the set of colors to use for the background_clades
364 if args.background_colors :
365 col = []
366
367 if get_file_type(args.background_colors) in ['txt'] :
368 with open(args.background_colors, 'r') as f :
369 col = [str(s.strip()) for s in f]
370 else : # it's a string in csv format
371 col = [c.strip() for c in args.background_colors.split(',')]
372
373 lst = {}
374 i = 0
375
376 for c in background_clades :
377 cc = c[:c.find('.')]
378
379 if cc not in lst :
380 background_colors[c] = col[i % len(col)]
381 lst[cc] = col[i % len(col)]
382 i += 1
383 else :
384 background_colors[c] = lst[cc]
385
386 # get the levels that will use the internal annotation
387 if args.annotations :
388 annotations_list = [int(i.strip()) for i in args.annotations.strip().split(',')]
389
390 # get the levels that will use the external legend annotation
391 if args.external_annotations :
392 external_annotations_list = [int(i.strip()) for i in args.external_annotations.strip().split(',')]
393
394 if args.lefse_input :
395 # if the lefse_input is in biom format, convert it
396 if get_file_type(args.lefse_input) in 'biom' :
397 try :
398 biom = parse_biom(args.lefse_input, args.discard_otus)
399 lefse_input = DataMatrix(StringIO(biom), args)
400 except Exception as e :
401 lin = True
402 print e
403 else :
404 lefse_input = DataMatrix(args.lefse_input, args)
405
406 if not lin :
407 taxa = [t.replace('|', '.') for t in lefse_input.get_fnames()] # build taxonomy list
408 abundances = dict(lefse_input.get_averages())
409 max_abundances = max([abundances[x] for x in abundances])
410 else : # no lefse_input provided
411 lin = True
412
413 if args.lefse_output :
414 # if the lefse_output is in biom format... I don't think it's possible!
415 if get_file_type(args.lefse_output) in 'biom' :
416 lout = True
417 print "Seriously?? LEfSe output file is not expected to be in biom format!"
418 else :
419 lst = []
420
421 with open(args.lefse_output, 'r') as out_file :
422 for line in out_file :
423 t, m, bk, es, pv = line.strip().split('\t')
424 lefse_output[t] = (es, bk, m, pv)
425
426 # get distinct biomarkers
427 if bk :
428 biomarkers |= set([bk])
429
430 # get all effect size
431 if es :
432 lst.append(float(es))
433
434 max_effect_size = max(lst)
435
436 # no lefse_input file provided!
437 if (not taxa) and (not abundances) : # build taxonomy list and abundaces map
438 for t in lefse_output :
439 _, _, m, _ = lefse_output[t]
440 abundances[t.replace('.', '|')] = float(m)
441
442 max_abundances = max([abundances[x] for x in abundances])
443
444 for t in lefse_output :
445 scaled = scale_clade_size(args.min_clade_size, args.max_clade_size, abundances[t.replace('.', '|')], max_abundances)
446
447 if scaled >= args.abundance_threshold :
448 taxa.append(t)
449 elif not lin : # no lefse_output provided and lefse_input correctly red
450 lout = True
451
452 # find the xxx most abundant
453 abundant = get_most_abundant(abundances, args.most_abundant)
454
455 # find the taxonomy level with at least yyy distinct childs from the xxx most abundant
456 biomarkers = get_biomarkes(abundant, args.least_biomarkers)
457
458 # compose lefse_output variable
459 for _, t in abundant :
460 b = ''
461
462 for bk in biomarkers :
463 if bk in t :
464 b = bk
465
466 lefse_output[t] = (2., b, '', '')
467
468 max_effect_size = 1. # It's not gonna working
469
470
471 # no lefse_output and no lefse_input provided
472 if lin and lout :
473 print "You must provide at least one input file!"
474 exit(1)
475
476 # write the tree
477 with open(args.tree, 'w') as tree_file :
478 for taxonomy in taxa :
479 tree_file.write(''.join([taxonomy, '\n']))
480
481 # for each biomarker assign it to a different color
482 i = 0
483
484 for bk in biomarkers :
485 color[bk] = i % len(colors)
486 i += 1
487
488 # find max log abs value of effect size
489 if lefse_output :
490 lst = []
491
492 for t in lefse_output :
493 es, _, _, _ = lefse_output[t]
494
495 if es :
496 lst.append(abs(log(float(es) / max_effect_size)))
497
498 max_log_effect_size = max(lst)
499
500 # write the annotation
501 try :
502 with open(args.annotation, 'w') as annot_file :
503 # set the title
504 if args.title :
505 annot_file.write(''.join(['\t'.join(['title', args.title]), '\n']))
506 annot_file.write(''.join(['\t'.join(['title_font_size', str(args.title_font_size)]), '\n']))
507
508 # write some basic customizations
509 annot_file.write(''.join(['\t'.join(['clade_separation', '0.5']), '\n']))
510 annot_file.write(''.join(['\t'.join(['branch_bracket_depth', '0.8']), '\n']))
511 annot_file.write(''.join(['\t'.join(['branch_bracket_width', '0.2']), '\n']))
512 annot_file.write(''.join(['\t'.join(['annotation_legend_font_size', str(args.annotation_legend_font_size)]), '\n']))
513 annot_file.write(''.join(['\t'.join(['class_legend_font_size', '10']), '\n']))
514 annot_file.write(''.join(['\t'.join(['class_legend_marker_size', '1.5']), '\n']))
515
516 # write the biomarkers' legend
517 for bk in biomarkers :
518 biom = bk.replace('_', ' ').upper()
519 rgb = scale_color(colors[color[bk]])
520 annot_file.write(''.join(['\t'.join([biom, 'annotation', biom]), '\n']))
521 annot_file.write(''.join(['\t'.join([biom, 'clade_marker_color', rgb]), '\n']))
522 annot_file.write(''.join(['\t'.join([biom, 'clade_marker_size', '40']), '\n']))
523
524 done_clades = []
525
526 # write the annotation for the tree
527 for taxonomy in taxa :
528 level = taxonomy.count('.') + 1 # which level is this taxonomy?
529 clean_taxonomy = taxonomy[taxonomy.rfind('.') + 1:] # retrieve the last level in taxonomy
530 scaled = args.def_clade_size
531
532 # scaled the size of the clade by the average abundance
533 if taxonomy.replace('.', '|') in abundances :
534 scaled = scale_clade_size(args.min_clade_size, args.max_clade_size, abundances[taxonomy.replace('.', '|')], max_abundances)
535
536 annot_file.write(''.join(['\t'.join([clean_taxonomy, 'clade_marker_size', str(scaled)]), '\n']))
537
538 # put a bakcground annotation to the levels specified by the user
539 shaded_background = []
540
541 for l in background_list :
542 if level >= l :
543 lst = [s.strip() for s in taxonomy.strip().split('.')]
544 t = '.'.join(lst[:l])
545
546 if t not in shaded_background :
547 shaded_background.append(t)
548
549 font_size = args.min_font_size + ((args.max_font_size - args.min_font_size) / l)
550
551 annot_file.write(''.join(['\t'.join([t, 'annotation_background_color', args.background_color]), '\n']))
552 annot_file.write(''.join(['\t'.join([t, 'annotation', '*']), '\n']))
553 annot_file.write(''.join(['\t'.join([t, 'annotation_font_size', str(font_size)]), '\n']))
554
555 # put a bakcground annotation to the clades specified by the user
556 for c in background_colors :
557 bg_color = background_colors[c]
558
559 if not bg_color.startswith('#') :
560 bg_color = bg_color.replace('(', '').replace(')', '')
561 h, s, v = bg_color.split(';')
562 bg_color = scale_color((float(h.strip()) , float(s.strip()), float(v.strip())))
563
564 # check if the taxonomy has more than one level
565 lvls = [str(cc.strip()) for cc in c.split('.')]
566
567 for l in lvls :
568 if (l in taxonomy) and (l not in done_clades) :
569 lvl = taxonomy[:taxonomy.index(l)].count('.') + 1
570 font_size = args.min_font_size + ((args.max_font_size - args.min_font_size) / lvl)
571
572
573 annot_file.write(''.join(['\t'.join([l, 'annotation_background_color', bg_color]), '\n']))
574 annot_file.write(''.join(['\t'.join([l, 'annotation', '*']), '\n']))
575 annot_file.write(''.join(['\t'.join([l, 'annotation_font_size', str(font_size)]), '\n']))
576
577 done_clades.append(l)
578
579 if lefse_output :
580 if taxonomy in lefse_output :
581 es, bk, _, _ = lefse_output[taxonomy]
582
583 # if it is a biomarker then color and label it!
584 if bk :
585 fac = abs(log(float(es) / max_effect_size)) / max_log_effect_size
586
587 try :
588 rgbs = scale_color(colors[color[bk]], fac)
589 except Exception as e :
590 print e
591 print ' '.join(["[W] Assign to", taxonomy, "the default color:", colors[color[bk]]])
592 rgbs = colors[color[bk]]
593
594 annot_file.write(''.join(['\t'.join([clean_taxonomy, 'clade_marker_color', rgbs]), '\n']))
595
596 # write the annotation only if the abundance is above a given threshold and it is either internal or external annotation lists
597 if (scaled >= args.abundance_threshold) and ((level in annotations_list) or (level in external_annotations_list)) :
598 font_size = args.min_font_size + ((args.max_font_size - args.min_font_size) / level)
599 annotation = '*' if level in annotations_list else '*:*'
600
601 annot_file.write(''.join(['\t'.join([clean_taxonomy, 'annotation_background_color', rgbs]), '\n']))
602 annot_file.write(''.join(['\t'.join([clean_taxonomy, 'annotation', annotation]), '\n']))
603 annot_file.write(''.join(['\t'.join([clean_taxonomy, 'annotation_font_size', str(font_size)]), '\n']))
604 except Exception as e :
605 print e
606
607
608 if __name__ == '__main__' :
609 main()