annotate export2graphlan.py @ 0:cac6247cb1d3 draft

graphlan_import
author george-weingart
date Tue, 26 Aug 2014 14:51:29 -0400
parents
children 82fb838d02dc
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
1 #!/usr/bin/env python
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
2
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
3 from argparse import ArgumentParser
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
4 from colorsys import hsv_to_rgb
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
5 from math import log
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
6 from StringIO import StringIO
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
7 from re import compile
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
8 from hclust2.hclust2 import DataMatrix
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
9
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
10
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
11 __author__ = 'Francesco Asnicar'
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
12 __email__ = "francesco.asnicar@gmail.com"
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
13 __version__ = '0.13'
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
14 __date__ = '29th July 2014'
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
15
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
16
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
17 def scale_color((h, s, v), factor=1.):
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
18 """
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
19 Takes as input a tuple that represents a color in HSV format, and optionally a scale factor.
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
20 Return an RGB string that is the converted HSV color, scaled by the given factor.
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
21 """
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
22 if (h < 0.) or (h > 360.):
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
23 raise Exception('[scale_color()] Hue value out of range (0, 360): ' + str(h))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
24
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
25 if (s < 0.) or (s > 100.):
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
26 raise Exception('[scale_color()] Saturation value out of range (0, 100): ' + str(s))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
27
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
28 if (v < 0.) or (v > 100.):
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
29 raise Exception('[scale_color()] Value value out of range (0, 100): ' + str(v))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
30
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
31 if (factor < 0.) or (factor > 1.):
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
32 raise Exception('[scale_color()] Factor value out of range (0.0, 1.0): ' + str(factor))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
33
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
34 v *= factor
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
35 r, g, b = hsv_to_rgb(h/360., s/100., v/100.)
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
36
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
37 return '#{0:02x}{1:02x}{2:02x}'.format(int(round(r*255.)), int(round(g*255.)), int(round(b*255.)))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
38
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
39
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
40 def read_params():
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
41 """
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
42 Parse the input parameters, performing some validity check.
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
43 Return the parsed arguments.
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
44 """
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
45 parser = ArgumentParser(description="export2graphlan.py (ver. " + __version__ +
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
46 " of " + __date__ + "). Convert MetaPhlAn, LEfSe, and/or HUMAnN output to GraPhlAn input format. Authors: "
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
47 + __author__ + " (" + __email__ + ")")
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
48
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
49 # input parameters group
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
50 group = parser.add_argument_group(title='input parameters',
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
51 description="You need to provide at least one of the two arguments")
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
52 group.add_argument('-i', '--lefse_input',
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
53 type=str,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
54 required=False,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
55 help="LEfSe input data")
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
56 group.add_argument('-o', '--lefse_output',
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
57 type=str,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
58 required=False,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
59 help="LEfSe output result data")
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
60
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
61 # output parameters group
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
62 group = parser.add_argument_group(title='output parameters')
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
63 group.add_argument('-t', '--tree',
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
64 type=str,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
65 required=True,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
66 help="Output filename where save the input tree for GraPhlAn")
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
67 group.add_argument('-a', '--annotation',
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
68 type=str,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
69 required=True,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
70 help="Output filename where save GraPhlAn annotation")
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
71
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
72 # annotations
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
73 parser.add_argument('--annotations',
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
74 default=None,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
75 type=str,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
76 required=False,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
77 help="List which levels should be annotated in the tree. Use a comma separate values form, e.g., --annotation_levels 1,2,3. Default is None")
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
78 parser.add_argument('--external_annotations',
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
79 default=None,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
80 type=str,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
81 required=False,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
82 help="List which levels should use the external legend for the annotation. Use a comma separate values form, e.g., --annotation_levels 1,2,3. Default is None")
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
83 # shaded background
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
84 parser.add_argument('--background_levels',
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
85 default=None,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
86 type=str,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
87 required=False,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
88 help="List which levels should be highlight with a shaded background. Use a comma separate values form, e.g., --background_levels 1,2,3")
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
89 parser.add_argument('--background_clades',
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
90 default=None,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
91 type=str,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
92 required=False,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
93 help="Specify the clades that should be highlight with a shaded background. Use a comma separate values form and surround the string with \" if it contains spaces. Example: --background_clades \"Bacteria.Actinobacteria, Bacteria.Bacteroidetes.Bacteroidia, Bacteria.Firmicutes.Clostridia.Clostridiales\"")
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
94 parser.add_argument('--background_colors',
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
95 default=None,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
96 type=str,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
97 required=False,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
98 help="Set the color to use for the shaded background. Colors can be either in RGB or HSV (using a semi-colon to separate values, surrounded with ()) format. Use a comma separate values form and surround the string with \" if it contains spaces. Example: --background_colors \"#29cc36, (150; 100; 100), (280; 80; 88)\"")
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
99 # title
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
100 parser.add_argument('--title',
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
101 type=str,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
102 required=False,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
103 help="If specified set the title of the GraPhlAn plot. Surround the string with \" if it contains spaces, e.g., --title \"Title example\"")
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
104 # title font size
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
105 parser.add_argument('--title_font_size',
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
106 default=15,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
107 type=int,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
108 required=False,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
109 help="Set the title font size. Default is 15")
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
110 # clade size
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
111 parser.add_argument('--def_clade_size',
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
112 default=10.,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
113 type=float,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
114 required=False,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
115 help="Set a default size for clades that are not found as biomarkers by LEfSe. Default is 10")
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
116 parser.add_argument('--min_clade_size',
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
117 default=20.,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
118 type=float,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
119 required=False,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
120 help="Set the minimum value of clades that are biomarkers. Default is 20")
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
121 parser.add_argument('--max_clade_size',
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
122 default=200.,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
123 type=float,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
124 required=False,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
125 help="Set the maximum value of clades that are biomarkers. Default is 200")
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
126 # font size
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
127 parser.add_argument('--def_font_size',
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
128 default=10,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
129 type=int,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
130 required=False,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
131 help="Set a default font size. Default is 10")
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
132 parser.add_argument('--min_font_size',
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
133 default=8,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
134 type=int,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
135 required=False,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
136 help="Set the minimum font size to use. Default is 8")
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
137 parser.add_argument('--max_font_size',
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
138 default=12,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
139 type=int,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
140 required=False,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
141 help="Set the maximum font size. Default is 12")
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
142 # legend font size
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
143 parser.add_argument('--annotation_legend_font_size',
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
144 default=10,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
145 type=int,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
146 required=False,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
147 help="Set the font size for the annotation legend. Default is 10")
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
148 # abundance threshold
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
149 parser.add_argument('--abundance_threshold',
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
150 default=20.,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
151 type=float,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
152 required=False,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
153 help="Set the minimun abundace value for a clade to be annotated. Default is 20.0")
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
154 # ONLY lefse_input provided
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
155 parser.add_argument('--most_abundant',
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
156 default=10,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
157 type=int,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
158 required=False,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
159 help="When only lefse_input is provided, you can specify how many clades highlight. Since the biomarkers are missing, they will be chosen from the most abundant")
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
160 parser.add_argument('--least_biomarkers',
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
161 default=3,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
162 type=int,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
163 required=False,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
164 help="When only lefse_input is provided, you can specify the minimum number of biomarkers to extract. The taxonomy is parsed, and the level is choosen in order to have at least the specified number of biomarkers")
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
165 # decide to keep the OTU id or to merger at the above taxonomic level
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
166 parser.add_argument('--discard_otus',
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
167 default=True,
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
168 action='store_false',
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
169 help="If specified the OTU ids will be discarde from the taxonmy. Default behavior keep OTU ids in taxonomy")
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
170
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
171 DataMatrix.input_parameters(parser)
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
172 args = parser.parse_args()
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
173
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
174 # check if at least one of the input params is given
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
175 if (not args.lefse_input) and (not args.lefse_output) :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
176 raise Exception("[read_params()] You must provide at least one of the two input parameters: ")
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
177
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
178 # check that min_clade_size is less than max_clade_size
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
179 if args.min_clade_size > args.max_clade_size :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
180 print "[W] min_clade_size cannot be greater than max_clade_size, assigning their default values"
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
181 args.min_clade_size = 20.
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
182 args.max_clade_size = 200.
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
183
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
184 # check that min_font_size is less than max_font_size
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
185 if args.min_font_size > args.max_font_size :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
186 print "[W] min_font_size cannot be greater than max_font_size, assigning their default values"
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
187 args.min_font_size = 8
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
188 args.max_font_size = 12
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
189
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
190 return args
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
191
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
192
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
193 def get_file_type(filename):
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
194 """
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
195 Return the extension (if any) of the ``filename`` in lower case.
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
196 """
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
197 return filename[filename.rfind('.')+1:].lower()
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
198
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
199
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
200 def parse_biom(filename, keep_otus=True):
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
201 """
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
202 Load a biom table and extract the taxonomy (from metadata), removing the unuseful header.
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
203 Return the input biom in tab-separated format.
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
204 """
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
205 from biom import load_table # avoid to ask for the BIOM library if there is no biom file
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
206
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
207 biom_table = load_table(filename)
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
208 strs = biom_table.delimited_self(header_value='TAXA', header_key='taxonomy')
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
209 lst1 = [str(s) for s in strs.split('\n')[1:]] # skip the "# Constructed from biom file" entry
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
210 biom_file = []
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
211 out = [lst1[0]] # save the header
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
212 pre_taxa = compile(".__")
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
213 classs = compile("\(class\)")
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
214
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
215 # consistency check
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
216 i = 0
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
217 while i < (len(lst1)-1):
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
218 if len([s for s in lst1[i].split('\t')]) != len([s for s in lst1[i+1].split('\t')]) :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
219 raise Exception('[parse_biom()] It seems that taxonomic metadata are missing, maybe is the wrong biom file?')
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
220
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
221 i += 1
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
222
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
223 for l in lst1[1:]:
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
224 otu = None
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
225 lst = [str(s).strip() for s in l.split('\t')[1:-1]]
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
226
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
227 if keep_otus:
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
228 otu = l.split('\t')[0]
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
229
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
230 # Clean an move taxa in first place
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
231 taxa = '.'.join([s.strip().replace('[', '').replace('u\'', '').replace(']', '').replace(' ', '').replace('\'', '') for s in l.split('\t')[-1].split(',')])
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
232 taxa = pre_taxa.sub('', taxa) # remove '{k|p|o|g|s}__'
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
233 taxa = classs.sub('', taxa) # remove '(class)'
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
234 taxa = taxa.rstrip('.') # remove trailing dots
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
235
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
236 if otu:
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
237 taxa = taxa + '.' + otu
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
238
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
239 biom_file.append([taxa] + lst)
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
240
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
241 # merge such rows that have the same taxa
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
242 i = 1
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
243 dic = {}
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
244
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
245 for l in biom_file[i:]:
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
246 for k in biom_file[i+1:]:
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
247 if l[0] == k[0]:
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
248 lst = []
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
249 j = 1
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
250
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
251 while j < len(l):
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
252 lst.append(float(l[j]) + float(k[j]))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
253 j += 1
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
254
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
255 if l[0] in dic:
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
256 lst1 = dic[l[0]]
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
257 j = 0
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
258 lst2 = []
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
259
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
260 while j < len(lst1):
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
261 lst2.append(float(lst1[j]) + float(lst[j]))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
262 j += 1
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
263
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
264 lst = lst2
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
265
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
266 dic[l[0]] = lst
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
267 # if not in dic, add it!
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
268 if l[0] not in dic:
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
269 dic[l[0]] = l[1:]
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
270
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
271 i += 1
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
272
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
273 for k in dic:
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
274 out.append('\t'.join([str(s) for s in [k] + dic[k]]))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
275
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
276 return '\n'.join(out)
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
277
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
278
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
279 def get_most_abundant(abundances, xxx):
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
280 """
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
281 Sort by the abundance level all the taxonomy that represent at least two levels.
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
282 Return the first ``xxx`` most abundant.
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
283 """
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
284 abundant = []
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
285
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
286 for a in abundances :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
287 if a.count('|') > 0:
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
288 abundant.append((float(abundances[a]), a.replace('|', '.')))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
289 elif a.count('.') > 0:
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
290 abundant.append((float(abundances[a]), a))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
291
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
292 abundant.sort(reverse=True)
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
293 return abundant[:xxx]
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
294
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
295
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
296 def get_biomarkes(abundant, xxx):
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
297 """
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
298 Split the taxonomy and then look, level by level, when there are at least ``xxx`` distinct branches.
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
299 Return the set of branches as biomarkers to highlight.
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
300 """
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
301 cc = []
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
302 bk = set()
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
303 lvl = 0
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
304
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
305 for _, t in abundant:
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
306 cc.append(t.split('.'))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
307
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
308 while lvl < len(max(cc)):
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
309 bk = set()
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
310
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
311 for c in cc:
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
312 if lvl < len(c):
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
313 bk |= set([c[lvl]])
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
314
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
315 if len(bk) >= xxx:
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
316 break
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
317
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
318 lvl += 1
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
319
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
320 return bk
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
321
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
322 def scale_clade_size(minn, maxx, abu, max_abu):
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
323 """
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
324 Return the value of ``abu`` scaled to ``max_abu`` logarithmically, and then map from ``minn`` to ``maxx``.
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
325 """
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
326 return minn + maxx*log(1. + (abu/max_abu))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
327
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
328
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
329 def main():
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
330 """
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
331 """
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
332 colors = [(245., 90., 100.), (125., 80., 80.), (0., 80., 100.), (195., 100., 100.), (150., 100., 100.), (55., 100., 100.), (280., 80., 88.)] # HSV format
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
333 args = read_params()
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
334 lefse_input = None
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
335 lefse_output = {}
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
336 color = {}
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
337 biomarkers = set()
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
338 taxa = []
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
339 abundances = {}
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
340 max_abundances = None
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
341 max_effect_size = None
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
342 max_log_effect_size = None
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
343 background_list = []
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
344 background_clades = []
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
345 background_colors = {}
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
346 annotations_list = []
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
347 external_annotations_list = []
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
348 lin = False
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
349 lout = False
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
350
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
351 # get the levels that should be shaded
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
352 if args.background_levels :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
353 background_list = [int(i.strip()) for i in args.background_levels.strip().split(',')]
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
354
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
355 # get the background_clades
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
356 if args.background_clades :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
357 if get_file_type(args.background_colors) in ['txt'] :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
358 with open(args.background_clades, 'r') as f:
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
359 background_clades = [str(s.strip()) for s in f]
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
360 else : # it's a string in csv format
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
361 background_clades = [str(s.strip()) for s in args.background_clades.split(',')]
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
362
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
363 # read the set of colors to use for the background_clades
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
364 if args.background_colors :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
365 col = []
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
366
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
367 if get_file_type(args.background_colors) in ['txt'] :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
368 with open(args.background_colors, 'r') as f :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
369 col = [str(s.strip()) for s in f]
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
370 else : # it's a string in csv format
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
371 col = [c.strip() for c in args.background_colors.split(',')]
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
372
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
373 lst = {}
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
374 i = 0
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
375
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
376 for c in background_clades :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
377 cc = c[:c.find('.')]
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
378
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
379 if cc not in lst :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
380 background_colors[c] = col[i % len(col)]
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
381 lst[cc] = col[i % len(col)]
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
382 i += 1
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
383 else :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
384 background_colors[c] = lst[cc]
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
385
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
386 # get the levels that will use the internal annotation
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
387 if args.annotations :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
388 annotations_list = [int(i.strip()) for i in args.annotations.strip().split(',')]
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
389
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
390 # get the levels that will use the external legend annotation
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
391 if args.external_annotations :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
392 external_annotations_list = [int(i.strip()) for i in args.external_annotations.strip().split(',')]
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
393
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
394 if args.lefse_input :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
395 # if the lefse_input is in biom format, convert it
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
396 if get_file_type(args.lefse_input) in 'biom' :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
397 try :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
398 biom = parse_biom(args.lefse_input, args.discard_otus)
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
399 lefse_input = DataMatrix(StringIO(biom), args)
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
400 except Exception as e :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
401 lin = True
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
402 print e
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
403 else :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
404 lefse_input = DataMatrix(args.lefse_input, args)
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
405
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
406 if not lin :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
407 taxa = [t.replace('|', '.') for t in lefse_input.get_fnames()] # build taxonomy list
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
408 abundances = dict(lefse_input.get_averages())
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
409 max_abundances = max([abundances[x] for x in abundances])
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
410 else : # no lefse_input provided
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
411 lin = True
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
412
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
413 if args.lefse_output :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
414 # if the lefse_output is in biom format... I don't think it's possible!
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
415 if get_file_type(args.lefse_output) in 'biom' :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
416 lout = True
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
417 print "Seriously?? LEfSe output file is not expected to be in biom format!"
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
418 else :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
419 lst = []
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
420
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
421 with open(args.lefse_output, 'r') as out_file :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
422 for line in out_file :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
423 t, m, bk, es, pv = line.strip().split('\t')
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
424 lefse_output[t] = (es, bk, m, pv)
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
425
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
426 # get distinct biomarkers
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
427 if bk :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
428 biomarkers |= set([bk])
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
429
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
430 # get all effect size
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
431 if es :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
432 lst.append(float(es))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
433
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
434 max_effect_size = max(lst)
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
435
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
436 # no lefse_input file provided!
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
437 if (not taxa) and (not abundances) : # build taxonomy list and abundaces map
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
438 for t in lefse_output :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
439 _, _, m, _ = lefse_output[t]
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
440 abundances[t.replace('.', '|')] = float(m)
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
441
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
442 max_abundances = max([abundances[x] for x in abundances])
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
443
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
444 for t in lefse_output :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
445 scaled = scale_clade_size(args.min_clade_size, args.max_clade_size, abundances[t.replace('.', '|')], max_abundances)
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
446
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
447 if scaled >= args.abundance_threshold :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
448 taxa.append(t)
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
449 elif not lin : # no lefse_output provided and lefse_input correctly red
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
450 lout = True
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
451
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
452 # find the xxx most abundant
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
453 abundant = get_most_abundant(abundances, args.most_abundant)
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
454
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
455 # find the taxonomy level with at least yyy distinct childs from the xxx most abundant
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
456 biomarkers = get_biomarkes(abundant, args.least_biomarkers)
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
457
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
458 # compose lefse_output variable
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
459 for _, t in abundant :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
460 b = ''
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
461
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
462 for bk in biomarkers :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
463 if bk in t :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
464 b = bk
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
465
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
466 lefse_output[t] = (2., b, '', '')
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
467
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
468 max_effect_size = 1. # It's not gonna working
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
469
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
470
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
471 # no lefse_output and no lefse_input provided
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
472 if lin and lout :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
473 print "You must provide at least one input file!"
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
474 exit(1)
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
475
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
476 # write the tree
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
477 with open(args.tree, 'w') as tree_file :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
478 for taxonomy in taxa :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
479 tree_file.write(''.join([taxonomy, '\n']))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
480
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
481 # for each biomarker assign it to a different color
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
482 i = 0
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
483
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
484 for bk in biomarkers :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
485 color[bk] = i % len(colors)
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
486 i += 1
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
487
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
488 # find max log abs value of effect size
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
489 if lefse_output :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
490 lst = []
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
491
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
492 for t in lefse_output :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
493 es, _, _, _ = lefse_output[t]
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
494
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
495 if es :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
496 lst.append(abs(log(float(es) / max_effect_size)))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
497
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
498 max_log_effect_size = max(lst)
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
499
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
500 # write the annotation
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
501 try :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
502 with open(args.annotation, 'w') as annot_file :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
503 # set the title
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
504 if args.title :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
505 annot_file.write(''.join(['\t'.join(['title', args.title]), '\n']))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
506 annot_file.write(''.join(['\t'.join(['title_font_size', str(args.title_font_size)]), '\n']))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
507
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
508 # write some basic customizations
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
509 annot_file.write(''.join(['\t'.join(['clade_separation', '0.5']), '\n']))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
510 annot_file.write(''.join(['\t'.join(['branch_bracket_depth', '0.8']), '\n']))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
511 annot_file.write(''.join(['\t'.join(['branch_bracket_width', '0.2']), '\n']))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
512 annot_file.write(''.join(['\t'.join(['annotation_legend_font_size', str(args.annotation_legend_font_size)]), '\n']))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
513 annot_file.write(''.join(['\t'.join(['class_legend_font_size', '10']), '\n']))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
514 annot_file.write(''.join(['\t'.join(['class_legend_marker_size', '1.5']), '\n']))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
515
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
516 # write the biomarkers' legend
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
517 for bk in biomarkers :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
518 biom = bk.replace('_', ' ').upper()
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
519 rgb = scale_color(colors[color[bk]])
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
520 annot_file.write(''.join(['\t'.join([biom, 'annotation', biom]), '\n']))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
521 annot_file.write(''.join(['\t'.join([biom, 'clade_marker_color', rgb]), '\n']))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
522 annot_file.write(''.join(['\t'.join([biom, 'clade_marker_size', '40']), '\n']))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
523
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
524 done_clades = []
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
525
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
526 # write the annotation for the tree
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
527 for taxonomy in taxa :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
528 level = taxonomy.count('.') + 1 # which level is this taxonomy?
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
529 clean_taxonomy = taxonomy[taxonomy.rfind('.') + 1:] # retrieve the last level in taxonomy
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
530 scaled = args.def_clade_size
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
531
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
532 # scaled the size of the clade by the average abundance
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
533 if taxonomy.replace('.', '|') in abundances :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
534 scaled = scale_clade_size(args.min_clade_size, args.max_clade_size, abundances[taxonomy.replace('.', '|')], max_abundances)
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
535
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
536 annot_file.write(''.join(['\t'.join([clean_taxonomy, 'clade_marker_size', str(scaled)]), '\n']))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
537
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
538 # put a bakcground annotation to the levels specified by the user
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
539 shaded_background = []
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
540
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
541 for l in background_list :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
542 if level >= l :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
543 lst = [s.strip() for s in taxonomy.strip().split('.')]
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
544 t = '.'.join(lst[:l])
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
545
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
546 if t not in shaded_background :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
547 shaded_background.append(t)
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
548
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
549 font_size = args.min_font_size + ((args.max_font_size - args.min_font_size) / l)
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
550
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
551 annot_file.write(''.join(['\t'.join([t, 'annotation_background_color', args.background_color]), '\n']))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
552 annot_file.write(''.join(['\t'.join([t, 'annotation', '*']), '\n']))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
553 annot_file.write(''.join(['\t'.join([t, 'annotation_font_size', str(font_size)]), '\n']))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
554
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
555 # put a bakcground annotation to the clades specified by the user
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
556 for c in background_colors :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
557 bg_color = background_colors[c]
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
558
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
559 if not bg_color.startswith('#') :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
560 bg_color = bg_color.replace('(', '').replace(')', '')
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
561 h, s, v = bg_color.split(';')
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
562 bg_color = scale_color((float(h.strip()) , float(s.strip()), float(v.strip())))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
563
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
564 # check if the taxonomy has more than one level
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
565 lvls = [str(cc.strip()) for cc in c.split('.')]
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
566
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
567 for l in lvls :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
568 if (l in taxonomy) and (l not in done_clades) :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
569 lvl = taxonomy[:taxonomy.index(l)].count('.') + 1
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
570 font_size = args.min_font_size + ((args.max_font_size - args.min_font_size) / lvl)
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
571
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
572
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
573 annot_file.write(''.join(['\t'.join([l, 'annotation_background_color', bg_color]), '\n']))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
574 annot_file.write(''.join(['\t'.join([l, 'annotation', '*']), '\n']))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
575 annot_file.write(''.join(['\t'.join([l, 'annotation_font_size', str(font_size)]), '\n']))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
576
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
577 done_clades.append(l)
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
578
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
579 if lefse_output :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
580 if taxonomy in lefse_output :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
581 es, bk, _, _ = lefse_output[taxonomy]
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
582
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
583 # if it is a biomarker then color and label it!
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
584 if bk :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
585 fac = abs(log(float(es) / max_effect_size)) / max_log_effect_size
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
586
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
587 try :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
588 rgbs = scale_color(colors[color[bk]], fac)
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
589 except Exception as e :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
590 print e
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
591 print ' '.join(["[W] Assign to", taxonomy, "the default color:", colors[color[bk]]])
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
592 rgbs = colors[color[bk]]
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
593
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
594 annot_file.write(''.join(['\t'.join([clean_taxonomy, 'clade_marker_color', rgbs]), '\n']))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
595
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
596 # write the annotation only if the abundance is above a given threshold and it is either internal or external annotation lists
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
597 if (scaled >= args.abundance_threshold) and ((level in annotations_list) or (level in external_annotations_list)) :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
598 font_size = args.min_font_size + ((args.max_font_size - args.min_font_size) / level)
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
599 annotation = '*' if level in annotations_list else '*:*'
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
600
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
601 annot_file.write(''.join(['\t'.join([clean_taxonomy, 'annotation_background_color', rgbs]), '\n']))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
602 annot_file.write(''.join(['\t'.join([clean_taxonomy, 'annotation', annotation]), '\n']))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
603 annot_file.write(''.join(['\t'.join([clean_taxonomy, 'annotation_font_size', str(font_size)]), '\n']))
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
604 except Exception as e :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
605 print e
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
606
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
607
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
608 if __name__ == '__main__' :
cac6247cb1d3 graphlan_import
george-weingart
parents:
diff changeset
609 main()