graphlan_import: export2graphlan.py comparison

comparison export2graphlan.py @ 28:82fb838d02dc draft

Uploaded updated version of the programs

author	george-weingart
date	Fri, 05 Sep 2014 14:00:09 -0400
parents	cac6247cb1d3
children

comparison

Legend: equal, deleted, inserted, replaced

-:252c242518f2
+:82fb838d02dc
 from hclust2.hclust2 import DataMatrix
 __author__ = 'Francesco Asnicar'
 __email__ = "francesco.asnicar@gmail.com"
-__version__ = '0.13'
+__version__ = '0.17'
-__date__ = '29th July 2014'
+__date__ = '21th August 2014'
 def scale_color((h, s, v), factor=1.):
-	"""
+"""
-	Takes as input a tuple that represents a color in HSV format, and optionally a scale factor.
+Takes as input a tuple that represents a color in HSV format, and optionally a scale factor.
-	Return an RGB string that is the converted HSV color, scaled by the given factor.
+Return an RGB string that is the converted HSV color, scaled by the given factor.
-	"""
+"""
-	if (h < 0.) or (h > 360.):
+if (h < 0.) or (h > 360.):
-		raise Exception('[scale_color()] Hue value out of range (0, 360): ' + str(h))
+raise Exception('[scale_color()] Hue value out of range (0, 360): ' + str(h))
-	if (s < 0.) or (s > 100.):
+if (s < 0.) or (s > 100.):
-		raise Exception('[scale_color()] Saturation value out of range (0, 100): ' + str(s))
+raise Exception('[scale_color()] Saturation value out of range (0, 100): ' + str(s))
-	if (v < 0.) or (v > 100.):
+if (v < 0.) or (v > 100.):
-		raise Exception('[scale_color()] Value value out of range (0, 100): ' + str(v))
+raise Exception('[scale_color()] Value value out of range (0, 100): ' + str(v))
-	if (factor < 0.) or (factor > 1.):
+if (factor < 0.) or (factor > 1.):
-		raise Exception('[scale_color()] Factor value out of range (0.0, 1.0): ' + str(factor))
+raise Exception('[scale_color()] Factor value out of range (0.0, 1.0): ' + str(factor))
-	v *= factor
+v *= factor
-	r, g, b = hsv_to_rgb(h/360., s/100., v/100.)
+r, g, b = hsv_to_rgb(h/360., s/100., v/100.)
-	return '#{0:02x}{1:02x}{2:02x}'.format(int(round(r*255.)), int(round(g*255.)), int(round(b*255.)))
+return '#{0:02x}{1:02x}{2:02x}'.format(int(round(r*255.)), int(round(g*255.)), int(round(b*255.)))
 def read_params():
-	"""
+"""
-	Parse the input parameters, performing some validity check.
+Parse the input parameters, performing some validity check.
-	Return the parsed arguments.
+Return the parsed arguments.
-	"""
+"""
-	parser = ArgumentParser(description="export2graphlan.py (ver. " + __version__ +
+parser = ArgumentParser(description="export2graphlan.py (ver. " + __version__ +
-		" of " + __date__ + "). Convert MetaPhlAn, LEfSe, and/or HUMAnN output to GraPhlAn input format. Authors: "
+" of " + __date__ + "). Convert MetaPhlAn, LEfSe, and/or HUMAnN output to GraPhlAn input format. Authors: "
-		+ __author__ + " (" + __email__ + ")")
++ __author__ + " (" + __email__ + ")")
-	# input parameters group
+# input parameters group
-	group = parser.add_argument_group(title='input parameters',
+group = parser.add_argument_group(title='input parameters',
-		description="You need to provide at least one of the two arguments")
+description="You need to provide at least one of the two arguments")
-	group.add_argument('-i', '--lefse_input',
+group.add_argument('-i', '--lefse_input',
-		type=str,
+type=str,
-		required=False,
+required=False,
-		help="LEfSe input data")
+help="LEfSe input data. A file that can be given to LEfSe for biomarkers analysis. It can be the result of a "
-	group.add_argument('-o', '--lefse_output',
+"MetaPhlAn or HUMAnN analysis")
-		type=str,
+group.add_argument('-o', '--lefse_output',
-		required=False,
+type=str,
-		help="LEfSe output result data")
+required=False,
+help="LEfSe output result data. The result of LEfSe analysis performed on the lefse_input file")
-	# output parameters group
-	group = parser.add_argument_group(title='output parameters')
+# output parameters group
-	group.add_argument('-t', '--tree',
+group = parser.add_argument_group(title='output parameters')
-		type=str,
+group.add_argument('-t', '--tree',
-		required=True,
+type=str,
-		help="Output filename where save the input tree for GraPhlAn")
+required=True,
-	group.add_argument('-a', '--annotation',
+help="Output filename where save the input tree for GraPhlAn")
-		type=str,
+group.add_argument('-a', '--annotation',
-		required=True,
+type=str,
-		help="Output filename where save GraPhlAn annotation")
+required=True,
+help="Output filename where save GraPhlAn annotation")
-	# annotations
-	parser.add_argument('--annotations',
+# annotations
-		default=None,
+parser.add_argument('--annotations',
-		type=str,
+default=None,
-		required=False,
+type=str,
-		help="List which levels should be annotated in the tree. Use a comma separate values form, e.g., --annotation_levels 1,2,3. Default is None")
+required=False,
-	parser.add_argument('--external_annotations',
+help="List which levels should be annotated in the tree. Use a comma separate values form, e.g., "
-		default=None,
+"--annotation_levels 1,2,3. Default is None")
-		type=str,
+parser.add_argument('--external_annotations',
-		required=False,
+default=None,
-		help="List which levels should use the external legend for the annotation. Use a comma separate values form, e.g., --annotation_levels 1,2,3. Default is None")
+type=str,
-	# shaded background
+required=False,
-	parser.add_argument('--background_levels',
+help="List which levels should use the external legend for the annotation. Use a comma separate values form, "
-		default=None,
+"e.g., --annotation_levels 1,2,3. Default is None")
-		type=str,
+# shaded background
-		required=False,
+parser.add_argument('--background_levels',
-		help="List which levels should be highlight with a shaded background. Use a comma separate values form, e.g., --background_levels 1,2,3")
+default=None,
-	parser.add_argument('--background_clades',
+type=str,
-		default=None,
+required=False,
-		type=str,
+help="List which levels should be highlight with a shaded background. Use a comma separate values form, e.g., "
-		required=False,
+"--background_levels 1,2,3. Default is None")
-		help="Specify the clades that should be highlight with a shaded background. Use a comma separate values form and surround the string with \" if it contains spaces. Example: --background_clades \"Bacteria.Actinobacteria, Bacteria.Bacteroidetes.Bacteroidia, Bacteria.Firmicutes.Clostridia.Clostridiales\"")
+parser.add_argument('--background_clades',
-	parser.add_argument('--background_colors',
+default=None,
-		default=None,
+type=str,
-		type=str,
+required=False,
-		required=False,
+help="Specify the clades that should be highlight with a shaded background. Use a comma separate values form "
-		help="Set the color to use for the shaded background. Colors can be either in RGB or HSV (using a semi-colon to separate values, surrounded with ()) format. Use a comma separate values form and surround the string with \" if it contains spaces. Example: --background_colors \"#29cc36, (150; 100; 100), (280; 80; 88)\"")
+"and surround the string with \" if there are spaces. Example: --background_clades \"Bacteria.Actinobacteria, "
-	# title
+"Bacteria.Bacteroidetes.Bacteroidia, Bacteria.Firmicutes.Clostridia.Clostridiales\". Default is None")
-	parser.add_argument('--title',
+parser.add_argument('--background_colors',
-		type=str,
+default=None,
-		required=False,
+type=str,
-		help="If specified set the title of the GraPhlAn plot. Surround the string with \" if it contains spaces, e.g., --title \"Title example\"")
+required=False,
-	# title font size
+help="Set the color to use for the shaded background. Colors can be either in RGB or HSV (using a semi-colon to "
-	parser.add_argument('--title_font_size',
+"separate values, surrounded with ()) format. Use a comma separate values form and surround the string with "
-		default=15,
+"\" if it contains spaces. Example: --background_colors \"#29cc36, (150; 100; 100), (280; 80; 88)\". Default "
-		type=int,
+"is None")
-		required=False,
+# title
-		help="Set the title font size. Default is 15")
+parser.add_argument('--title',
-	# clade size
+type=str,
-	parser.add_argument('--def_clade_size',
+required=False,
-		default=10.,
+help="If specified set the title of the GraPhlAn plot. Surround the string with \" if it contains spaces, e.g., "
-		type=float,
+"--title \"Title example\"")
-		required=False,
+# title font size
-		help="Set a default size for clades that are not found as biomarkers by LEfSe. Default is 10")
+parser.add_argument('--title_font_size',
-	parser.add_argument('--min_clade_size',
+default=15,
-		default=20.,
+type=int,
-		type=float,
+required=False,
-		required=False,
+help="Set the title font size. Default is 15")
-		help="Set the minimum value of clades that are biomarkers. Default is 20")
+# clade size
-	parser.add_argument('--max_clade_size',
+parser.add_argument('--def_clade_size',
-		default=200.,
+default=10.,
-		type=float,
+type=float,
-		required=False,
+required=False,
-		help="Set the maximum value of clades that are biomarkers. Default is 200")
+help="Set a default size for clades that are not found as biomarkers by LEfSe. Default is 10")
-	# font size
+parser.add_argument('--min_clade_size',
-	parser.add_argument('--def_font_size',
+default=20.,
-		default=10,
+type=float,
-		type=int,
+required=False,
-		required=False,
+help="Set the minimum value of clades that are biomarkers. Default is 20")
-		help="Set a default font size. Default is 10")
+parser.add_argument('--max_clade_size',
-	parser.add_argument('--min_font_size',
+default=200.,
-		default=8,
+type=float,
-		type=int,
+required=False,
-		required=False,
+help="Set the maximum value of clades that are biomarkers. Default is 200")
-		help="Set the minimum font size to use. Default is 8")
+# font size
-	parser.add_argument('--max_font_size',
+parser.add_argument('--def_font_size',
-		default=12,
+default=10,
-		type=int,
+type=int,
-		required=False,
+required=False,
-		help="Set the maximum font size. Default is 12")
+help="Set a default font size. Default is 10")
-	# legend font size
+parser.add_argument('--min_font_size',
-	parser.add_argument('--annotation_legend_font_size',
+default=8,
-		default=10,
+type=int,
-		type=int,
+required=False,
-		required=False,
+help="Set the minimum font size to use. Default is 8")
-		help="Set the font size for the annotation legend. Default is 10")
+parser.add_argument('--max_font_size',
-	# abundance threshold
+default=12,
-	parser.add_argument('--abundance_threshold',
+type=int,
-		default=20.,
+required=False,
-		type=float,
+help="Set the maximum font size. Default is 12")
-		required=False,
+# legend font size
-		help="Set the minimun abundace value for a clade to be annotated. Default is 20.0")
+parser.add_argument('--annotation_legend_font_size',
-	# ONLY lefse_input provided
+default=10,
-	parser.add_argument('--most_abundant',
+type=int,
-		default=10,
+required=False,
-		type=int,
+help="Set the font size for the annotation legend. Default is 10")
-		required=False,
+# abundance threshold
-		help="When only lefse_input is provided, you can specify how many clades highlight. Since the biomarkers are missing, they will be chosen from the most abundant")
+parser.add_argument('--abundance_threshold',
-	parser.add_argument('--least_biomarkers',
+default=20.,
-		default=3,
+type=float,
-		type=int,
+required=False,
-		required=False,
+help="Set the minimun abundace value for a clade to be annotated. Default is 20.0")
-		help="When only lefse_input is provided, you can specify the minimum number of biomarkers to extract. The taxonomy is parsed, and the level is choosen in order to have at least the specified number of biomarkers")
+# ONLY lefse_input provided
-	# decide to keep the OTU id or to merger at the above taxonomic level
+parser.add_argument('--most_abundant',
-	parser.add_argument('--discard_otus',
+default=10,
-		default=True,
+type=int,
-		action='store_false',
+required=False,
-		help="If specified the OTU ids will be discarde from the taxonmy. Default behavior keep OTU ids in taxonomy")
+help="When only lefse_input is provided, you can specify how many clades highlight. Since the biomarkers are "
+"missing, they will be chosen from the most abundant. Default is 10")
-	DataMatrix.input_parameters(parser)
+parser.add_argument('--least_biomarkers',
-	args = parser.parse_args()
+default=3,
+type=int,
-	# check if at least one of the input params is given
+required=False,
-	if (not args.lefse_input) and (not args.lefse_output) :
+help="When only lefse_input is provided, you can specify the minimum number of biomarkers to extract. The "
-		raise Exception("[read_params()] You must provide at least one of the two input parameters: ")
+"taxonomy is parsed, and the level is choosen in order to have at least the specified number of biomarkers. "
+"Default is 3")
-	# check that min_clade_size is less than max_clade_size
+# decide to keep the OTU id or to merger at the above taxonomic level
-	if args.min_clade_size > args.max_clade_size :
+parser.add_argument('--discard_otus',
-		print "[W] min_clade_size cannot be greater than max_clade_size, assigning their default values"
+default=True,
-		args.min_clade_size = 20.
+action='store_false',
-		args.max_clade_size = 200.
+help="If specified the OTU ids will be discarde from the taxonmy. Default is True, i.e. keep OTUs IDs in taxonomy")
+# decide to keep the OTU id or to merger at the above taxonomic level
-	# check that min_font_size is less than max_font_size
+parser.add_argument('--internal_levels',
-	if args.min_font_size > args.max_font_size :
+default=False,
-		print "[W] min_font_size cannot be greater than max_font_size, assigning their default values"
+action='store_true',
-		args.min_font_size = 8
+help="If specified sum-up from leaf to root the abundances values. Default is False, i.e. do not sum-up abundances "
-		args.max_font_size = 12
+"on the internal nodes")
-	return args
+DataMatrix.input_parameters(parser)
+args = parser.parse_args()
+# check if at least one of the input params is given
+if (not args.lefse_input) and (not args.lefse_output):
+raise Exception("[read_params()] You must provide at least one of the two input parameters: ")
+# check that min_clade_size is less than max_clade_size
+if args.min_clade_size > args.max_clade_size:
+print "[W] min_clade_size cannot be greater than max_clade_size, assigning their default values"
+args.min_clade_size = 20.
+args.max_clade_size = 200.
+# check that min_font_size is less than max_font_size
+if args.min_font_size > args.max_font_size:
+print "[W] min_font_size cannot be greater than max_font_size, assigning their default values"
+args.min_font_size = 8
+args.max_font_size = 12
+return args
 def get_file_type(filename):
-	"""
+"""
-	Return the extension (if any) of the ``filename`` in lower case.
+Return the extension (if any) of the ``filename`` in lower case.
-	"""
+"""
-	return filename[filename.rfind('.')+1:].lower()
+return filename[filename.rfind('.')+1:].lower()
-def parse_biom(filename, keep_otus=True):
+def parse_biom(filename, keep_otus=True, internal_levels=False):
-	"""
+"""
-	Load a biom table and extract the taxonomy (from metadata), removing the unuseful header.
+Load a biom table and extract the taxonomy (from metadata), removing the unuseful header.
-	Return the input biom in tab-separated format.
+Return the input biom in tab-separated format.
-	"""
+"""
-	from biom import load_table # avoid to ask for the BIOM library if there is no biom file
+from biom import load_table # avoid to ask for the BIOM library if there is no biom file
-	biom_table = load_table(filename)
+biom_table = load_table(filename)
-	strs = biom_table.delimited_self(header_value='TAXA', header_key='taxonomy')
+strs = biom_table.delimited_self(header_value='TAXA', header_key='taxonomy')
-	lst1 = [str(s) for s in strs.split('\n')[1:]] # skip the "# Constructed from biom file" entry
+lst1 = [str(s) for s in strs.split('\n')[1:]] # skip the "# Constructed from biom file" entry
-	biom_file = []
+biom_file = []
-	out = [lst1[0]] # save the header
+out = [lst1[0]] # save the header
-	pre_taxa = compile(".__")
+pre_taxa = compile(".__")
-	classs = compile("\(class\)")
+classs = compile("\(class\)")
-	# consistency check
+# consistency check
-	i = 0
+i = 0
-	while i < (len(lst1)-1):
+while i < (len(lst1)-1):
-		if len([s for s in lst1[i].split('\t')]) != len([s for s in lst1[i+1].split('\t')]) :
+if len([s for s in lst1[i].split('\t')]) != len([s for s in lst1[i+1].split('\t')]):
-			raise Exception('[parse_biom()] It seems that taxonomic metadata are missing, maybe is the wrong biom file?')
+raise Exception('[parse_biom()] It seems that taxonomic metadata are missing, maybe is the wrong biom file?')
-		i += 1
+i += 1
-	for l in lst1[1:]:
+for l in lst1[1:]:
-		otu = None
+otu = None
-		lst = [str(s).strip() for s in l.split('\t')[1:-1]]
+lst = [float(s.strip()) for s in l.split('\t')[1:-1]]
-		if keep_otus:
+if keep_otus:
-			otu = l.split('\t')[0]
+otu = l.split('\t')[0]
-		# Clean an move taxa in first place
+# Clean and move taxa in first place
-		taxa = '.'.join([s.strip().replace('[', '').replace('u\'', '').replace(']', '').replace(' ', '').replace('\'', '') for s in l.split('\t')[-1].split(',')])
+taxa = '.'.join([s.strip().replace('[', '').replace('u\'', '').replace(']', '').replace(' ', '').replace('\'', '')
-		taxa = pre_taxa.sub('', taxa) # remove '{k|p|o|g|s}__'
+for s in l.split('\t')[-1].split(',')])
-		taxa = classs.sub('', taxa) # remove '(class)'
+taxa = pre_taxa.sub('', taxa) # remove '{k|p|o|g|s}__'
-		taxa = taxa.rstrip('.') # remove trailing dots
+taxa = classs.sub('', taxa) # remove '(class)'
+taxa = taxa.rstrip('.') # remove trailing dots
-		if otu:
-			taxa = taxa + '.' + otu
+if otu:
+taxa = taxa + '.' + otu
-		biom_file.append([taxa] + lst)
+biom_file.append([taxa] + lst)
-	# merge such rows that have the same taxa
-	i = 1
+# merge such rows that have the same taxa
-	dic = {}
+i = 1
+dic = {}
-	for l in biom_file[i:]:
-		for k in biom_file[i+1:]:
+for l in biom_file[i:]:
-			if l[0] == k[0]:
+if l[0] not in dic:
-				lst = []
+dic[l[0]] = l[1:]
-				j = 1
+for k in biom_file[i+1:]:
-				while j < len(l):
+if l[0] == k[0]:
-					lst.append(float(l[j]) + float(k[j]))
+lst = []
-					j += 1
+lstdic = dic[l[0]]
+j = 1
-				if l[0] in dic:
+while j < len(lstdic):
-					lst1 = dic[l[0]]
+lst.append(float(lstdic[j]) + float(k[j]))
-					j = 0
+j += 1
-					lst2 = []
+dic[l[0]] = lst
-					while j < len(lst1):
+i += 1
-						lst2.append(float(lst1[j]) + float(lst[j]))
-						j += 1
+feats = dict(dic)
-					lst = lst2
+if internal_levels:
+feats = add_missing_levels(feats)
-				dic[l[0]] = lst
-		# if not in dic, add it!
+for k in feats:
-		if l[0] not in dic:
+out.append('\t'.join([str(s) for s in [k] + feats[k]]))
-			dic[l[0]] = l[1:]
+return '\n'.join(out)
-		i += 1
-	for k in dic:
+def add_missing_levels(ff, summ=True):
-		out.append('\t'.join([str(s) for s in [k] + dic[k]]))
+"""
+Sum-up the internal abundances from leaf to root
-	return '\n'.join(out)
+"""
+if sum([f.count(".") for f in ff]) < 1:
+return ff
+clades2leaves = {}
+for f in ff:
+fs = f.split(".")
+if len(fs) < 2:
+continue
+for l in range(1, len(fs)+1):
+n = ".".join(fs[:l])
+if n in clades2leaves:
+clades2leaves[n].append(f)
+else:
+clades2leaves[n] = [f]
+ret = {}
+for k in clades2leaves:
+if summ:
+ret[k] = [sum([sum(ff[e]) for e in clades2leaves[k]])]
+else:
+lst = []
+for e in clades2leaves[k]:
+if not lst:
+for i in ff[e]:
+lst.append(i)
+else:
+lst1 = []
+i = 0
+while i < len(lst):
+lst1.append(lst[i] + ff[e][i])
+i += 1
+lst = lst1
+ret[k] = lst
+return ret
 def get_most_abundant(abundances, xxx):
-	"""
+"""
-	Sort by the abundance level all the taxonomy that represent at least two levels.
+Sort by the abundance level all the taxonomy that represent at least two levels.
-	Return the first ``xxx`` most abundant.
+Return the first ``xxx`` most abundant.
-	"""
+"""
-	abundant = []
+abundant = []
-	for a in abundances :
+for a in abundances:
-		if a.count('|') > 0:
+if a.count('|') > 0:
-			abundant.append((float(abundances[a]), a.replace('|', '.')))
+abundant.append((float(abundances[a]), a.replace('|', '.')))
-		elif a.count('.') > 0:
+elif a.count('.') > 0:
-			abundant.append((float(abundances[a]), a))
+abundant.append((float(abundances[a]), a))
-	abundant.sort(reverse=True)
+abundant.sort(reverse=True)
-	return abundant[:xxx]
+return abundant[:xxx]
 def get_biomarkes(abundant, xxx):
-	"""
+"""
-	Split the taxonomy and then look, level by level, when there are at least ``xxx`` distinct branches.
+Split the taxonomy and then look, level by level, when there are at least ``xxx`` distinct branches.
-	Return the set of branches as biomarkers to highlight.
+Return the set of branches as biomarkers to highlight.
-	"""
+"""
-	cc = []
+cc = []
-	bk = set()
+bk = set()
-	lvl = 0
+lvl = 0
-	for _, t in abundant:
+for _, t in abundant:
-		cc.append(t.split('.'))
+cc.append(t.split('.'))
-	while lvl < len(max(cc)):
+while lvl < len(max(cc)):
-		bk = set()
+bk = set()
-		for c in cc:
+for c in cc:
-			if lvl < len(c):
+if lvl < len(c):
-				bk |= set([c[lvl]])
+bk |= set([c[lvl]])
-		if len(bk) >= xxx:
+if len(bk) >= xxx:
-			break
+break
-		lvl += 1
+lvl += 1
-	return bk
+return bk
 def scale_clade_size(minn, maxx, abu, max_abu):
-	"""
+"""
-	Return the value of ``abu`` scaled to ``max_abu`` logarithmically, and then map from ``minn`` to ``maxx``.
+Return the value of ``abu`` scaled to ``max_abu`` logarithmically, and then map from ``minn`` to ``maxx``.
-	"""
+"""
-	return minn + maxx*log(1. + (abu/max_abu))
+return minn + maxx*log(1. + (abu/max_abu))
 def main():
-	"""
+"""
-	"""
+"""
-	colors = [(245., 90., 100.), (125., 80., 80.), (0., 80., 100.), (195., 100., 100.), (150., 100., 100.), (55., 100., 100.), (280., 80., 88.)] # HSV format
+colors = [(245., 90., 100.), (125., 80., 80.), (0., 80., 100.), (195., 100., 100.),
-	args = read_params()
+(150., 100., 100.), (55., 100., 100.), (280., 80., 88.)] # HSV format
-	lefse_input = None
+args = read_params()
-	lefse_output = {}
+lefse_input = None
-	color = {}
+lefse_output = {}
-	biomarkers = set()
+color = {}
-	taxa = []
+biomarkers = set()
-	abundances = {}
+taxa = []
-	max_abundances = None
+abundances = {}
-	max_effect_size = None
+max_abundances = None
-	max_log_effect_size = None
+max_effect_size = None
-	background_list = []
+max_log_effect_size = None
-	background_clades = []
+background_list = []
-	background_colors = {}
+background_clades = []
-	annotations_list = []
+background_colors = {}
-	external_annotations_list = []
+annotations_list = []
-	lin = False
+external_annotations_list = []
-	lout = False
+lin = False
+lout = False
-	# get the levels that should be shaded
-	if args.background_levels :
+# get the levels that should be shaded
-		background_list = [int(i.strip()) for i in args.background_levels.strip().split(',')]
+if args.background_levels:
+background_list = [int(i.strip()) for i in args.background_levels.strip().split(',')]
-	# get the background_clades
-	if args.background_clades :
+# get the background_clades
-		if get_file_type(args.background_colors) in ['txt'] :
+if args.background_clades:
-			with open(args.background_clades, 'r') as f:
+if get_file_type(args.background_colors) in ['txt']:
-				background_clades = [str(s.strip()) for s in f]
+with open(args.background_clades, 'r') as f:
-		else : # it's a string in csv format
+background_clades = [str(s.strip()) for s in f]
-			background_clades = [str(s.strip()) for s in args.background_clades.split(',')]
+else: # it's a string in csv format
+background_clades = [str(s.strip()) for s in args.background_clades.split(',')]
-	# read the set of colors to use for the background_clades
-	if args.background_colors :
+# read the set of colors to use for the background_clades
-		col = []
+if args.background_colors:
+col = []
-		if get_file_type(args.background_colors) in ['txt'] :
-			with open(args.background_colors, 'r') as f :
+if get_file_type(args.background_colors) in ['txt']:
-				col = [str(s.strip()) for s in f]
+with open(args.background_colors, 'r') as f:
-		else : # it's a string in csv format
+col = [str(s.strip()) for s in f]
-			col = [c.strip() for c in args.background_colors.split(',')]
+else: # it's a string in csv format
+col = [c.strip() for c in args.background_colors.split(',')]
-		lst = {}
-		i = 0
+lst = {}
+i = 0
-		for c in background_clades :
-			cc = c[:c.find('.')]
+for c in background_clades:
+cc = c[:c.find('.')]
-			if cc not in lst :
-				background_colors[c] = col[i % len(col)]
+if cc not in lst:
-				lst[cc] = col[i % len(col)]
+background_colors[c] = col[i % len(col)]
-				i += 1
+lst[cc] = col[i % len(col)]
-			else :
+i += 1
-				background_colors[c] = lst[cc]
+else:
+background_colors[c] = lst[cc]
-	# get the levels that will use the internal annotation
-	if args.annotations :
+# get the levels that will use the internal annotation
-		annotations_list = [int(i.strip()) for i in args.annotations.strip().split(',')]
+if args.annotations:
+annotations_list = [int(i.strip()) for i in args.annotations.strip().split(',')]
-	# get the levels that will use the external legend annotation
-	if args.external_annotations :
+# get the levels that will use the external legend annotation
-		external_annotations_list = [int(i.strip()) for i in args.external_annotations.strip().split(',')]
+if args.external_annotations:
+external_annotations_list = [int(i.strip()) for i in args.external_annotations.strip().split(',')]
-	if args.lefse_input :
-		# if the lefse_input is in biom format, convert it
+if args.lefse_input:
-		if get_file_type(args.lefse_input) in 'biom' :
+# if the lefse_input is in biom format, convert it
-			try :
+if get_file_type(args.lefse_input) in 'biom':
-				biom = parse_biom(args.lefse_input, args.discard_otus)
+try:
-				lefse_input = DataMatrix(StringIO(biom), args)
+biom = parse_biom(args.lefse_input, args.discard_otus, args.internal_levels)
-			except Exception as e :
+lefse_input = DataMatrix(StringIO(biom), args)
-				lin = True
+except Exception as e:
-				print e
+lin = True
-		else :
+print e
-			lefse_input = DataMatrix(args.lefse_input, args)
+else:
+if args.internal_levels:
-		if not lin :
+aaa = {}
-			taxa = [t.replace('|', '.') for t in lefse_input.get_fnames()] # build taxonomy list
+header = None
-			abundances = dict(lefse_input.get_averages())
+with open(args.lefse_input, 'r') as f:
-			max_abundances = max([abundances[x] for x in abundances])
+for r in f:
-	else : # no lefse_input provided
+if header is None:
-		lin = True
+header = [s.strip() for s in r.split('\t')]
+else:
-	if args.lefse_output :
+row = r.split('\t')
-		# if the lefse_output is in biom format... I don't think it's possible!
+aaa[row[0].strip().replace('|', '.')] = [float(s.strip()) for s in row[1:]]
-		if get_file_type(args.lefse_output) in 'biom' :
-			lout = True
+feats = add_missing_levels(aaa, summ=False)
-			print "Seriously?? LEfSe output file is not expected to be in biom format!"
+ss = '\t'.join(header)
-		else :
+ss += '\n'
-			lst = []
+ss += '\n'.join(['\t'.join([str(s) for s in [k] + feats[k]]) for k in feats])
+lefse_input = DataMatrix(StringIO(ss), args)
-			with open(args.lefse_output, 'r') as out_file :
+else:
-				for line in out_file :
+lefse_input = DataMatrix(args.lefse_input, args)
-					t, m, bk, es, pv = line.strip().split('\t')
-					lefse_output[t] = (es, bk, m, pv)
+if not lin:
+taxa = [t.replace('|', '.').strip() for t in lefse_input.get_fnames()] # build taxonomy list
-					# get distinct biomarkers
+abundances = dict(lefse_input.get_averages())
-					if bk :
+max_abundances = max([abundances[x] for x in abundances])
-						biomarkers |= set([bk])
+else: # no lefse_input provided
+lin = True
-					# get all effect size
-					if es :
+if args.lefse_output:
-						lst.append(float(es))
+# if the lefse_output is in biom format... I don't think it's possible!
+if get_file_type(args.lefse_output) in 'biom':
-				max_effect_size = max(lst)
+lout = True
+print "Seriously?? LEfSe output file is not expected to be in biom format!"
-			# no lefse_input file provided!
+else:
-			if (not taxa) and (not abundances) : # build taxonomy list and abundaces map
+lst = []
-				for t in lefse_output :
-					_, _, m, _ = lefse_output[t]
+with open(args.lefse_output, 'r') as out_file:
-					abundances[t.replace('.', '|')] = float(m)
+for line in out_file:
+t, m, bk, es, pv = line.strip().split('\t')
-				max_abundances = max([abundances[x] for x in abundances])
+lefse_output[t] = (es, bk, m, pv)
-				for t in lefse_output :
+# get distinct biomarkers
-					scaled = scale_clade_size(args.min_clade_size, args.max_clade_size, abundances[t.replace('.', '|')], max_abundances)
+if bk:
+biomarkers |= set([bk])
-					if scaled >= args.abundance_threshold :
-						taxa.append(t)
+# get all effect size
-	elif not lin : # no lefse_output provided and lefse_input correctly red
+if es:
-		lout = True
+lst.append(float(es))
-		# find the xxx most abundant
+max_effect_size = max(lst)
-		abundant = get_most_abundant(abundances, args.most_abundant)
+# no lefse_input file provided!
-		# find the taxonomy level with at least yyy distinct childs from the xxx most abundant
+if (not taxa) and (not abundances): # build taxonomy list and abundaces map
-		biomarkers = get_biomarkes(abundant, args.least_biomarkers)
+for t in lefse_output:
+_, _, m, _ = lefse_output[t]
-		# compose lefse_output variable
+abundances[t.replace('.', '|')] = float(m)
-		for _, t in abundant :
-			b = ''
+max_abundances = max([abundances[x] for x in abundances])
-			for bk in biomarkers :
+for t in lefse_output:
-				if bk in t :
+scaled = scale_clade_size(args.min_clade_size, args.max_clade_size,
-					b = bk
+abundances[t.replace('.', '|')], max_abundances)
-			lefse_output[t] = (2., b, '', '')
+if scaled >= args.abundance_threshold:
+taxa.append(t)
-		max_effect_size = 1. # It's not gonna working
+elif not lin: # no lefse_output provided and lefse_input correctly red
+lout = True
-	# no lefse_output and no lefse_input provided
+# find the xxx most abundant
-	if lin and lout :
+abundant = get_most_abundant(abundances, args.most_abundant)
-		print "You must provide at least one input file!"
-		exit(1)
+# find the taxonomy level with at least yyy distinct childs from the xxx most abundant
+biomarkers = get_biomarkes(abundant, args.least_biomarkers)
-	# write the tree
-	with open(args.tree, 'w') as tree_file :
+# compose lefse_output variable
-		for taxonomy in taxa :
+for _, t in abundant:
-			tree_file.write(''.join([taxonomy, '\n']))
+b = ''
-	# for each biomarker assign it to a different color
+for bk in biomarkers:
-	i = 0
+if bk in t:
+b = bk
-	for bk in biomarkers :
-		color[bk] = i % len(colors)
+lefse_output[t] = (2., b, '', '')
-		i += 1
+max_effect_size = 1. # It's not gonna working
-	# find max log abs value of effect size
+# no lefse_output and no lefse_input provided
-	if lefse_output :
+if lin and lout:
-		lst = []
+print "You must provide at least one input file!"
+exit(1)
-		for t in lefse_output :
-			es, _, _, _ = lefse_output[t]
+# write the tree
+with open(args.tree, 'w') as tree_file:
-			if es :
+tree_file.write('\n'.join(taxa))
-				lst.append(abs(log(float(es) / max_effect_size)))
+# for each biomarker assign it to a different color
-		max_log_effect_size = max(lst)
+i = 0
-	# write the annotation
+for bk in biomarkers:
-	try :
+color[bk] = i % len(colors)
-		with open(args.annotation, 'w') as annot_file :
+i += 1
-			# set the title
-			if args.title :
+# find max log abs value of effect size
-				annot_file.write(''.join(['\t'.join(['title', args.title]), '\n']))
+if lefse_output:
-				annot_file.write(''.join(['\t'.join(['title_font_size', str(args.title_font_size)]), '\n']))
+lst = []
-			# write some basic customizations
+for t in lefse_output:
-			annot_file.write(''.join(['\t'.join(['clade_separation', '0.5']), '\n']))
+es, _, _, _ = lefse_output[t]
-			annot_file.write(''.join(['\t'.join(['branch_bracket_depth', '0.8']), '\n']))
-			annot_file.write(''.join(['\t'.join(['branch_bracket_width', '0.2']), '\n']))
+if es:
-			annot_file.write(''.join(['\t'.join(['annotation_legend_font_size', str(args.annotation_legend_font_size)]), '\n']))
+lst.append(abs(log(float(es) / max_effect_size)))
-			annot_file.write(''.join(['\t'.join(['class_legend_font_size', '10']), '\n']))
-			annot_file.write(''.join(['\t'.join(['class_legend_marker_size', '1.5']), '\n']))
+max_log_effect_size = max(lst)
-			# write the biomarkers' legend
+# write the annotation
-			for bk in biomarkers :
+try:
-				biom = bk.replace('_', ' ').upper()
+with open(args.annotation, 'w') as annot_file:
-				rgb = scale_color(colors[color[bk]])
+# set the title
-				annot_file.write(''.join(['\t'.join([biom, 'annotation', biom]), '\n']))
+if args.title:
-				annot_file.write(''.join(['\t'.join([biom, 'clade_marker_color', rgb]), '\n']))
+annot_file.write('\n'.join(['\t'.join(['title', args.title]),
-				annot_file.write(''.join(['\t'.join([biom, 'clade_marker_size', '40']), '\n']))
+'\t'.join(['title_font_size', str(args.title_font_size)]), '\n']))
-			done_clades = []
+# write some basic customizations
+annot_file.write('\n'.join(['\t'.join(['clade_separation', '0.5']),
-			# write the annotation for the tree
+'\t'.join(['branch_bracket_depth', '0.8']),
-			for taxonomy in taxa :
+'\t'.join(['branch_bracket_width', '0.2']),
-				level = taxonomy.count('.') + 1 # which level is this taxonomy?
+'\t'.join(['annotation_legend_font_size', str(args.annotation_legend_font_size)]),
-				clean_taxonomy = taxonomy[taxonomy.rfind('.') + 1:] # retrieve the last level in taxonomy
+'\t'.join(['class_legend_font_size', '10']),
-				scaled = args.def_clade_size
+'\t'.join(['class_legend_marker_size', '1.5']), '\n']))
-				# scaled the size of the clade by the average abundance
+# write the biomarkers' legend
-				if taxonomy.replace('.', '|') in abundances :
+for bk in biomarkers:
-					scaled = scale_clade_size(args.min_clade_size, args.max_clade_size, abundances[taxonomy.replace('.', '|')], max_abundances)
+biom = bk.replace('_', ' ').upper()
+rgb = scale_color(colors[color[bk]])
-				annot_file.write(''.join(['\t'.join([clean_taxonomy, 'clade_marker_size', str(scaled)]), '\n']))
+annot_file.write('\n'.join(['\t'.join([biom, 'annotation', biom]),
+'\t'.join([biom, 'clade_marker_color', rgb]),
-				# put a bakcground annotation to the levels specified by the user
+'\t'.join([biom, 'clade_marker_size', '40']), '\n']))
-				shaded_background = []
+# write the annotation for the tree
-				for l in background_list :
+for taxonomy in taxa:
-					if level >= l :
+level = taxonomy.count('.') + 1 # which level is this taxonomy?
-						lst = [s.strip() for s in taxonomy.strip().split('.')]
+clean_taxonomy = taxonomy[taxonomy.rfind('.') + 1:] # retrieve the last level in taxonomy
-						t = '.'.join(lst[:l])
+cleanest_taxonomy = clean_taxonomy.replace('_', ' ') # substitute '_' with ' '
+scaled = args.def_clade_size
-						if t not in shaded_background :
-							shaded_background.append(t)
+# scaled the size of the clade by the average abundance
+if (taxonomy in abundances) or (taxonomy.replace('.', '|') in abundances):
-							font_size = args.min_font_size + ((args.max_font_size - args.min_font_size) / l)
+try:
+abu = abundances[taxonomy.replace('.', '|')]
-							annot_file.write(''.join(['\t'.join([t, 'annotation_background_color', args.background_color]), '\n']))
+except:
-							annot_file.write(''.join(['\t'.join([t, 'annotation', '*']), '\n']))
+abu = abundances[taxonomy]
-							annot_file.write(''.join(['\t'.join([t, 'annotation_font_size', str(font_size)]), '\n']))
+scaled = scale_clade_size(args.min_clade_size, args.max_clade_size, abu, max_abundances)
-				# put a bakcground annotation to the clades specified by the user
-				for c in background_colors :
+annot_file.write(''.join(['\t'.join([clean_taxonomy, 'clade_marker_size', str(scaled)]), '\n']))
-					bg_color = background_colors[c]
+# put a background annotation to the levels specified by the user
-					if not bg_color.startswith('#') :
+shaded_background = []
-						bg_color = bg_color.replace('(', '').replace(')', '')
-						h, s, v = bg_color.split(';')
+for l in background_list:
-						bg_color = scale_color((float(h.strip()) , float(s.strip()), float(v.strip())))
+if level >= l:
+lst = [s.strip() for s in taxonomy.strip().split('.')]
-					# check if the taxonomy has more than one level
+t = '.'.join(lst[:l])
-					lvls = [str(cc.strip()) for cc in c.split('.')]
+if t not in shaded_background:
-					for l in lvls :
+shaded_background.append(t)
-						if (l in taxonomy) and (l not in done_clades) :
-							lvl = taxonomy[:taxonomy.index(l)].count('.') + 1
+font_size = args.min_font_size + ((args.max_font_size - args.min_font_size) / l)
-							font_size = args.min_font_size + ((args.max_font_size - args.min_font_size) / lvl)
+annot_file.write('\n'.join(['\t'.join([t, 'annotation_background_color', args.background_color]),
+'\t'.join([t, 'annotation', t.replace('_', ' ')]),
-							annot_file.write(''.join(['\t'.join([l, 'annotation_background_color', bg_color]), '\n']))
+'\t'.join([t, 'annotation_font_size', str(font_size)]), '\n']))
-							annot_file.write(''.join(['\t'.join([l, 'annotation', '*']), '\n']))
-							annot_file.write(''.join(['\t'.join([l, 'annotation_font_size', str(font_size)]), '\n']))
+# put a background annotation to the clades specified by the user
+for c in background_colors:
-							done_clades.append(l)
+bg_color = background_colors[c]
-				if lefse_output :
+if not bg_color.startswith('#'):
-					if taxonomy in lefse_output :
+bg_color = bg_color.replace('(', '').replace(')', '')
-						es, bk, _, _ = lefse_output[taxonomy]
+h, s, v = bg_color.split(';')
+bg_color = scale_color((float(h.strip()) , float(s.strip()), float(v.strip())))
-						# if it is a biomarker then color and label it!
-						if bk :
+# check if the taxonomy has more than one level
-							fac = abs(log(float(es) / max_effect_size)) / max_log_effect_size
+lvls = [str(cc.strip()) for cc in c.split('.')]
+done_clades = []
-							try :
-								rgbs = scale_color(colors[color[bk]], fac)
+for l in lvls:
-							except Exception as e :
+if (l in taxonomy) and (l not in done_clades):
-								print e
+lvl = taxonomy[:taxonomy.index(l)].count('.') + 1
-								print ' '.join(["[W] Assign to", taxonomy, "the default color:", colors[color[bk]]])
+font_size = args.min_font_size + ((args.max_font_size - args.min_font_size) / lvl)
-								rgbs = colors[color[bk]]
+annot_file.write('\n'.join(['\t'.join([l, 'annotation_background_color', bg_color]),
-							annot_file.write(''.join(['\t'.join([clean_taxonomy, 'clade_marker_color', rgbs]), '\n']))
+'\t'.join([l, 'annotation', l.replace('_', ' ')]),
+'\t'.join([l, 'annotation_font_size', str(font_size)]), '\n']))
-							# write the annotation only if the abundance is above a given threshold and it is either internal or external annotation lists
-							if (scaled >= args.abundance_threshold) and ((level in annotations_list) or (level in external_annotations_list)) :
+done_clades.append(l)
-								font_size = args.min_font_size + ((args.max_font_size - args.min_font_size) / level)
-								annotation = '*' if level in annotations_list else '*:*'
+if lefse_output:
+if taxonomy in lefse_output:
-								annot_file.write(''.join(['\t'.join([clean_taxonomy, 'annotation_background_color', rgbs]), '\n']))
+es, bk, _, _ = lefse_output[taxonomy]
-								annot_file.write(''.join(['\t'.join([clean_taxonomy, 'annotation', annotation]), '\n']))
-								annot_file.write(''.join(['\t'.join([clean_taxonomy, 'annotation_font_size', str(font_size)]), '\n']))
+# if it is a biomarker then color and label it!
-	except Exception as e :
+if bk:
-		print e
+fac = abs(log(float(es) / max_effect_size)) / max_log_effect_size
+try:
-if __name__ == '__main__' :
+rgbs = scale_color(colors[color[bk]], fac)
-	main()
+except Exception as e:
+print e
+print ' '.join(["[W] Assign to", taxonomy, "the default color:", colors[color[bk]]])
+rgbs = colors[color[bk]]
+annot_file.write(''.join(['\t'.join([clean_taxonomy, 'clade_marker_color', rgbs]), '\n']))
+# write the annotation only if the abundance is above a given threshold and the level is in either
+# the internal or the external annotation list
+if (scaled >= args.abundance_threshold) and \
+((level in annotations_list) or (level in external_annotations_list)):
+font_size = args.min_font_size + ((args.max_font_size - args.min_font_size) / level)
+annotation = cleanest_taxonomy if level in annotations_list else '*:' + cleanest_taxonomy
+annot_file.write('\n'.join(['\t'.join([clean_taxonomy, 'annotation_background_color', rgbs]),
+'\t'.join([clean_taxonomy, 'annotation', annotation]),
+'\t'.join([clean_taxonomy, 'annotation_font_size', str(font_size)]), '\n']))
+except Exception as e:
+print e
+if __name__ == '__main__':
+main()

Mercurial > repos > george-weingart > graphlan_import

comparison export2graphlan.py @ 28:82fb838d02dc draft