changeset 0:cac6247cb1d3 draft

graphlan_import
author george-weingart
date Tue, 26 Aug 2014 14:51:29 -0400
parents
children 2c0d791fc950
files export2graphlan.py export2graphlan.xml hclust2/__init__.py hclust2/__init__.pyc hclust2/hclust2.py hclust2/hclust2.pyc merge_metaphlan.xml merge_metaphlan_tables.py
diffstat 7 files changed, 1575 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/export2graphlan.py	Tue Aug 26 14:51:29 2014 -0400
@@ -0,0 +1,609 @@
+#!/usr/bin/env python
+
+from argparse import ArgumentParser
+from colorsys import hsv_to_rgb
+from math import log
+from StringIO import StringIO
+from re import compile
+from hclust2.hclust2 import DataMatrix
+
+
+__author__ = 'Francesco Asnicar'
+__email__ = "francesco.asnicar@gmail.com"
+__version__ = '0.13'
+__date__ = '29th July 2014'
+
+
+def scale_color((h, s, v), factor=1.):
+	"""
+	Takes as input a tuple that represents a color in HSV format, and optionally a scale factor.
+	Return an RGB string that is the converted HSV color, scaled by the given factor.
+	"""
+	if (h < 0.) or (h > 360.):
+		raise Exception('[scale_color()] Hue value out of range (0, 360): ' + str(h))
+	
+	if (s < 0.) or (s > 100.):
+		raise Exception('[scale_color()] Saturation value out of range (0, 100): ' + str(s))
+
+	if (v < 0.) or (v > 100.):
+		raise Exception('[scale_color()] Value (V) out of range (0, 100): ' + str(v))
+
+	if (factor < 0.) or (factor > 1.):
+		raise Exception('[scale_color()] Factor value out of range (0.0, 1.0): ' + str(factor))
+
+	v *= factor
+	r, g, b = hsv_to_rgb(h/360., s/100., v/100.)
+
+	return '#{0:02x}{1:02x}{2:02x}'.format(int(round(r*255.)), int(round(g*255.)), int(round(b*255.)))
+
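+# A quick hand-check of scale_color() (values computed by hand, not part of
+# the original script): scale_color((150., 100., 100.)) returns '#00ff80',
+# and passing factor=0.5 darkens it to '#008040'.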
+
+def read_params():
+	"""
+	Parse the input parameters, performing some validity check.
+	Return the parsed arguments.
+	"""
+	parser = ArgumentParser(description="export2graphlan.py (ver. " + __version__ +
+		" of " + __date__ + "). Convert MetaPhlAn, LEfSe, and/or HUMAnN output to GraPhlAn input format. Authors: "
+		+ __author__ + " (" + __email__ + ")")
+	
+	# input parameters group
+	group = parser.add_argument_group(title='input parameters',
+		description="You need to provide at least one of the two arguments")
+	group.add_argument('-i', '--lefse_input',
+		type=str,
+		required=False,
+		help="LEfSe input data")
+	group.add_argument('-o', '--lefse_output',
+		type=str,
+		required=False,
+		help="LEfSe output result data")
+
+	# output parameters group
+	group = parser.add_argument_group(title='output parameters')
+	group.add_argument('-t', '--tree',
+		type=str,
+		required=True,
+		help="Output filename where save the input tree for GraPhlAn")
+	group.add_argument('-a', '--annotation',
+		type=str,
+		required=True,
+		help="Output filename where save GraPhlAn annotation")
+
+	# annotations
+	parser.add_argument('--annotations',
+		default=None,
+		type=str,
+		required=False,
+		help="List which levels should be annotated in the tree. Use a comma separate values form, e.g., --annotation_levels 1,2,3. Default is None")
+	parser.add_argument('--external_annotations',
+		default=None,
+		type=str,
+		required=False,
+		help="List which levels should use the external legend for the annotation. Use a comma separate values form, e.g., --annotation_levels 1,2,3. Default is None")
+	# shaded background
+	parser.add_argument('--background_levels',
+		default=None,
+		type=str,
+		required=False,
+		help="List which levels should be highlight with a shaded background. Use a comma separate values form, e.g., --background_levels 1,2,3")
+	parser.add_argument('--background_clades',
+		default=None,
+		type=str,
+		required=False,
+		help="Specify the clades that should be highlight with a shaded background. Use a comma separate values form and surround the string with \" if it contains spaces. Example: --background_clades \"Bacteria.Actinobacteria, Bacteria.Bacteroidetes.Bacteroidia, Bacteria.Firmicutes.Clostridia.Clostridiales\"")
+	parser.add_argument('--background_colors',
+		default=None,
+		type=str,
+		required=False,
+		help="Set the color to use for the shaded background. Colors can be either in RGB or HSV (using a semi-colon to separate values, surrounded with ()) format. Use a comma separate values form and surround the string with \" if it contains spaces. Example: --background_colors \"#29cc36, (150; 100; 100), (280; 80; 88)\"")
+	# title
+	parser.add_argument('--title',
+		type=str,
+		required=False,
+		help="If specified set the title of the GraPhlAn plot. Surround the string with \" if it contains spaces, e.g., --title \"Title example\"")
+	# title font size
+	parser.add_argument('--title_font_size',
+		default=15,
+		type=int,
+		required=False,
+		help="Set the title font size. Default is 15")
+	# clade size
+	parser.add_argument('--def_clade_size',
+		default=10.,
+		type=float,
+		required=False,
+		help="Set a default size for clades that are not found as biomarkers by LEfSe. Default is 10")
+	parser.add_argument('--min_clade_size',
+		default=20.,
+		type=float,
+		required=False,
+		help="Set the minimum value of clades that are biomarkers. Default is 20")
+	parser.add_argument('--max_clade_size',
+		default=200.,
+		type=float,
+		required=False,
+		help="Set the maximum value of clades that are biomarkers. Default is 200")
+	# font size
+	parser.add_argument('--def_font_size',
+		default=10,
+		type=int,
+		required=False,
+		help="Set a default font size. Default is 10")
+	parser.add_argument('--min_font_size',
+		default=8,
+		type=int,
+		required=False,
+		help="Set the minimum font size to use. Default is 8")
+	parser.add_argument('--max_font_size',
+		default=12,
+		type=int,
+		required=False,
+		help="Set the maximum font size. Default is 12")
+	# legend font size
+	parser.add_argument('--annotation_legend_font_size',
+		default=10,
+		type=int,
+		required=False,
+		help="Set the font size for the annotation legend. Default is 10")
+	# abundance threshold
+	parser.add_argument('--abundance_threshold',
+		default=20.,
+		type=float,
+		required=False,
+		help="Set the minimun abundace value for a clade to be annotated. Default is 20.0")
+	# ONLY lefse_input provided
+	parser.add_argument('--most_abundant',
+		default=10,
+		type=int,
+		required=False,
+		help="When only lefse_input is provided, you can specify how many clades highlight. Since the biomarkers are missing, they will be chosen from the most abundant")
+	parser.add_argument('--least_biomarkers',
+		default=3,
+		type=int,
+		required=False,
+		help="When only lefse_input is provided, you can specify the minimum number of biomarkers to extract. The taxonomy is parsed, and the level is choosen in order to have at least the specified number of biomarkers")
+	# decide whether to keep the OTU ids or to merge them at the above taxonomic level
+	parser.add_argument('--discard_otus',
+		default=True,
+		action='store_false',
+		help="If specified the OTU ids will be discarde from the taxonmy. Default behavior keep OTU ids in taxonomy")
+
+	DataMatrix.input_parameters(parser)
+	args = parser.parse_args()
+
+	# check if at least one of the input params is given
+	if (not args.lefse_input) and (not args.lefse_output) :
+		raise Exception("[read_params()] You must provide at least one of the two input parameters: ")
+
+	# check that min_clade_size is less than max_clade_size
+	if args.min_clade_size > args.max_clade_size :
+		print "[W] min_clade_size cannot be greater than max_clade_size, assigning their default values"
+		args.min_clade_size = 20.
+		args.max_clade_size = 200.
+	
+	# check that min_font_size is less than max_font_size
+	if args.min_font_size > args.max_font_size :
+		print "[W] min_font_size cannot be greater than max_font_size, assigning their default values"
+		args.min_font_size = 8
+		args.max_font_size = 12
+	
+	return args
+
+
+def get_file_type(filename):
+	"""
+	Return the extension (if any) of the ``filename`` in lower case.
+	"""
+	return filename[filename.rfind('.')+1:].lower()
+
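+# e.g. get_file_type('profile.BIOM') returns 'biom'; for a name without a
+# dot, rfind('.') is -1, so the whole name is returned lower-cased.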
+
+def parse_biom(filename, keep_otus=True):
+	"""
+	Load a biom table and extract the taxonomy (from metadata), removing the unneeded header.
+	Return the input biom in tab-separated format.
+	"""
+	from biom import load_table # import here so the BIOM library is required only when a biom file is actually given
+
+	biom_table = load_table(filename)
+	strs = biom_table.delimited_self(header_value='TAXA', header_key='taxonomy')
+	lst1 = [str(s) for s in strs.split('\n')[1:]] # skip the "# Constructed from biom file" entry
+	biom_file = []
+	out = [lst1[0]] # save the header
+	pre_taxa = compile(".__")
+	classs = compile("\(class\)")
+
+	# consistency check
+	i = 0
+	while i < (len(lst1)-1):
+		if len([s for s in lst1[i].split('\t')]) != len([s for s in lst1[i+1].split('\t')]) :
+			raise Exception('[parse_biom()] It seems that taxonomic metadata are missing; is this the wrong biom file?')
+
+		i += 1
+
+	for l in lst1[1:]:
+		otu = None
+		lst = [str(s).strip() for s in l.split('\t')[1:-1]]
+
+		if keep_otus:
+			otu = l.split('\t')[0]
+
+		# Clean the taxonomy and move it to first place
+		taxa = '.'.join([s.strip().replace('[', '').replace('u\'', '').replace(']', '').replace(' ', '').replace('\'', '') for s in l.split('\t')[-1].split(',')])
+		taxa = pre_taxa.sub('', taxa) # remove '{k|p|o|g|s}__'
+		taxa = classs.sub('', taxa) # remove '(class)'
+		taxa = taxa.rstrip('.') # remove trailing dots
+
+		if otu:
+			taxa = taxa + '.' + otu
+
+		biom_file.append([taxa] + lst)
+
+	# merge rows that have the same taxa (starting from the first data row)
+	i = 0
+	dic = {}
+
+	for l in biom_file[i:]:
+		for k in biom_file[i+1:]:
+			if l[0] == k[0]:
+				lst = []
+				j = 1
+
+				while j < len(l):
+					lst.append(float(l[j]) + float(k[j]))
+					j += 1
+
+				if l[0] in dic:
+					lst1 = dic[l[0]]
+					j = 0
+					lst2 = []
+
+					while j < len(lst1):
+						lst2.append(float(lst1[j]) + float(lst[j]))
+						j += 1
+
+					lst = lst2
+
+				dic[l[0]] = lst
+		# if not in dic, add it!
+		if l[0] not in dic:
+			dic[l[0]] = l[1:]
+
+		i += 1
+
+	for k in dic:
+		out.append('\t'.join([str(s) for s in [k] + dic[k]]))
+	
+	return '\n'.join(out)
+
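+# The string returned by parse_biom() is a plain tab-separated table whose
+# first column is the dot-joined taxonomy, e.g. (hypothetical values):
+#   TAXA<tab>sample1<tab>sample2
+#   Bacteria.Firmicutes.Clostridia<tab>12.0<tab>7.5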
+
+def get_most_abundant(abundances, xxx):
+	"""
+	Sort, by abundance, all the taxonomies that have at least two levels.
+	Return the first ``xxx`` most abundant.
+	"""
+	abundant = []
+
+	for a in abundances :
+		if a.count('|') > 0:
+			abundant.append((float(abundances[a]), a.replace('|', '.')))
+		elif a.count('.') > 0:
+			abundant.append((float(abundances[a]), a))
+
+	abundant.sort(reverse=True)
+	return abundant[:xxx]
+
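+# Example: get_most_abundant({'Bacteria|Firmicutes': 5., 'Bacteria': 9.}, 1)
+# returns [(5.0, 'Bacteria.Firmicutes')]; single-level taxa such as
+# 'Bacteria' are skipped because they contain neither '|' nor '.'.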
+
+def get_biomarkes(abundant, xxx):
+	"""
+	Split the taxonomies and scan them level by level until there are at least ``xxx`` distinct branches.
+	Return the set of branches to highlight as biomarkers.
+	"""
+	cc = []
+	bk = set()
+	lvl = 0
+
+	for _, t in abundant:
+		cc.append(t.split('.'))
+
+	while lvl < max(len(c) for c in cc): # len(max(cc)) would compare the lists lexicographically, not by length
+		bk = set()
+
+		for c in cc:
+			if lvl < len(c):
+				bk |= set([c[lvl]])
+
+		if len(bk) >= xxx:
+			break
+
+		lvl += 1
+
+	return bk
+
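+# Example (hypothetical input): with abundant = [(9., 'Bacteria.Firmicutes'),
+# (5., 'Bacteria.Bacteroidetes')] and xxx = 2, level 0 yields the single
+# branch {'Bacteria'}, so the scan moves on and returns
+# {'Firmicutes', 'Bacteroidetes'} at level 1.
+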
+def scale_clade_size(minn, maxx, abu, max_abu):
+	"""
+	Return ``minn`` plus ``maxx`` times the log-scaled ratio of ``abu`` to ``max_abu`` (so an abundance of 0 maps to ``minn``).
+	"""
+	return minn + maxx*log(1. + (abu/max_abu))
+
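+# e.g., with minn=20 and maxx=200, an abundance equal to max_abu scales to
+# 20 + 200*log(2) ~= 158.6, while an abundance of 0 maps to minn.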
+
+def main():
+	"""
+	"""
+	colors = [(245., 90., 100.), (125., 80., 80.), (0., 80., 100.), (195., 100., 100.), (150., 100., 100.), (55., 100., 100.), (280., 80., 88.)] # HSV format
+	args = read_params()
+	lefse_input = None
+	lefse_output = {}
+	color = {}
+	biomarkers = set()
+	taxa = []
+	abundances = {}
+	max_abundances = None
+	max_effect_size = None
+	max_log_effect_size = None
+	background_list = []
+	background_clades = []
+	background_colors = {}
+	annotations_list = []
+	external_annotations_list = []
+	lin = False
+	lout = False
+
+	# get the levels that should be shaded
+	if args.background_levels :
+		background_list = [int(i.strip()) for i in args.background_levels.strip().split(',')]
+
+	# get the background_clades
+	if args.background_clades :
+		if get_file_type(args.background_clades) in ['txt'] :
+			with open(args.background_clades, 'r') as f:
+				background_clades = [str(s.strip()) for s in f]
+		else : # it's a string in csv format
+			background_clades = [str(s.strip()) for s in args.background_clades.split(',')]
+
+	# read the set of colors to use for the background_clades
+	if args.background_colors :
+		col = []
+
+		if get_file_type(args.background_colors) in ['txt'] :
+			with open(args.background_colors, 'r') as f :
+				col = [str(s.strip()) for s in f]
+		else : # it's a string in csv format
+			col = [c.strip() for c in args.background_colors.split(',')]
+
+		lst = {}
+		i = 0
+
+		for c in background_clades :
+			cc = c[:c.find('.')]
+
+			if cc not in lst :
+				background_colors[c] = col[i % len(col)]
+				lst[cc] = col[i % len(col)]
+				i += 1
+			else :
+				background_colors[c] = lst[cc]
+
+	# get the levels that will use the internal annotation
+	if args.annotations :
+		annotations_list = [int(i.strip()) for i in args.annotations.strip().split(',')]
+
+	# get the levels that will use the external legend annotation
+	if args.external_annotations :
+		external_annotations_list = [int(i.strip()) for i in args.external_annotations.strip().split(',')]
+
+	if args.lefse_input :
+		# if the lefse_input is in biom format, convert it
+		if get_file_type(args.lefse_input) == 'biom' :
+			try :
+				biom = parse_biom(args.lefse_input, args.discard_otus)
+				lefse_input = DataMatrix(StringIO(biom), args)
+			except Exception as e :
+				lin = True
+				print e
+		else :
+			lefse_input = DataMatrix(args.lefse_input, args)
+		
+		if not lin :
+			taxa = [t.replace('|', '.') for t in lefse_input.get_fnames()] # build taxonomy list
+			abundances = dict(lefse_input.get_averages())
+			max_abundances = max([abundances[x] for x in abundances])
+	else : # no lefse_input provided
+		lin = True
+
+	if args.lefse_output :
+		# if the lefse_output is in biom format... I don't think it's possible!
+		if get_file_type(args.lefse_output) == 'biom' :
+			lout = True
+			print "Seriously?? LEfSe output file is not expected to be in biom format!"
+		else :
+			lst = []
+
+			with open(args.lefse_output, 'r') as out_file :
+				for line in out_file :
+					t, m, bk, es, pv = line.strip().split('\t')
+					lefse_output[t] = (es, bk, m, pv)
+
+					# get distinct biomarkers
+					if bk :
+						biomarkers |= set([bk])
+
+					# get all effect size
+					if es :
+						lst.append(float(es))
+
+				max_effect_size = max(lst)
+
+			# no lefse_input file provided!
+			if (not taxa) and (not abundances) : # build taxonomy list and abundances map
+				for t in lefse_output :
+					_, _, m, _ = lefse_output[t]
+					abundances[t.replace('.', '|')] = float(m)
+
+				max_abundances = max([abundances[x] for x in abundances])
+
+				for t in lefse_output :
+					scaled = scale_clade_size(args.min_clade_size, args.max_clade_size, abundances[t.replace('.', '|')], max_abundances)
+
+					if scaled >= args.abundance_threshold :
+						taxa.append(t)
+	elif not lin : # no lefse_output provided and lefse_input correctly read
+		lout = True
+		
+		# find the xxx most abundant
+		abundant = get_most_abundant(abundances, args.most_abundant)
+
+		# find the taxonomy level with at least least_biomarkers distinct children among the most abundant clades
+		biomarkers = get_biomarkes(abundant, args.least_biomarkers)
+
+		# compose lefse_output variable
+		for _, t in abundant :
+			b = ''
+
+			for bk in biomarkers :
+				if bk in t :
+					b = bk
+
+			lefse_output[t] = (2., b, '', '')
+
+		max_effect_size = 1. # placeholder: with the synthetic effect sizes above, the color scaling factor is 1 for every biomarker
+	
+
+	# no lefse_output and no lefse_input provided
+	if lin and lout :
+		print "You must provide at least one input file!"
+		exit(1)
+
+	# write the tree
+	with open(args.tree, 'w') as tree_file :
+		for taxonomy in taxa :
+			tree_file.write(''.join([taxonomy, '\n']))
+
+	# for each biomarker assign it to a different color
+	i = 0
+
+	for bk in biomarkers :
+		color[bk] = i % len(colors)
+		i += 1
+
+	# find max log abs value of effect size
+	if lefse_output :
+		lst = []
+
+		for t in lefse_output :
+			es, _, _, _ = lefse_output[t]
+			
+			if es :
+				lst.append(abs(log(float(es) / max_effect_size)))
+
+		max_log_effect_size = max(lst)
+
+	# write the annotation
+	try :
+		with open(args.annotation, 'w') as annot_file :
+			# set the title
+			if args.title :
+				annot_file.write(''.join(['\t'.join(['title', args.title]), '\n']))
+				annot_file.write(''.join(['\t'.join(['title_font_size', str(args.title_font_size)]), '\n']))
+
+			# write some basic customizations
+			annot_file.write(''.join(['\t'.join(['clade_separation', '0.5']), '\n']))
+			annot_file.write(''.join(['\t'.join(['branch_bracket_depth', '0.8']), '\n']))
+			annot_file.write(''.join(['\t'.join(['branch_bracket_width', '0.2']), '\n']))
+			annot_file.write(''.join(['\t'.join(['annotation_legend_font_size', str(args.annotation_legend_font_size)]), '\n']))
+			annot_file.write(''.join(['\t'.join(['class_legend_font_size', '10']), '\n']))
+			annot_file.write(''.join(['\t'.join(['class_legend_marker_size', '1.5']), '\n']))
+
+			# write the biomarkers' legend
+			for bk in biomarkers :
+				biom = bk.replace('_', ' ').upper()
+				rgb = scale_color(colors[color[bk]])
+				annot_file.write(''.join(['\t'.join([biom, 'annotation', biom]), '\n']))
+				annot_file.write(''.join(['\t'.join([biom, 'clade_marker_color', rgb]), '\n']))
+				annot_file.write(''.join(['\t'.join([biom, 'clade_marker_size', '40']), '\n']))
+
+			done_clades = []
+
+			# write the annotation for the tree
+			for taxonomy in taxa :
+				level = taxonomy.count('.') + 1 # which level is this taxonomy?
+				clean_taxonomy = taxonomy[taxonomy.rfind('.') + 1:] # retrieve the last level in taxonomy
+				scaled = args.def_clade_size
+
+				# scaled the size of the clade by the average abundance
+				if taxonomy.replace('.', '|') in abundances :
+					scaled = scale_clade_size(args.min_clade_size, args.max_clade_size, abundances[taxonomy.replace('.', '|')], max_abundances)
+				
+				annot_file.write(''.join(['\t'.join([clean_taxonomy, 'clade_marker_size', str(scaled)]), '\n']))
+
+				# put a background annotation on the levels specified by the user
+				shaded_background = []
+
+				for l in background_list :
+					if level >= l :
+						lst = [s.strip() for s in taxonomy.strip().split('.')]
+						t = '.'.join(lst[:l])
+
+						if t not in shaded_background :
+							shaded_background.append(t)
+
+							font_size = args.min_font_size + ((args.max_font_size - args.min_font_size) / l)
+
+							# default shade for level backgrounds ('#d3d3d3' is an assumed neutral grey; read_params() defines no background color option)
+							annot_file.write(''.join(['\t'.join([t, 'annotation_background_color', '#d3d3d3']), '\n']))
+							annot_file.write(''.join(['\t'.join([t, 'annotation', '*']), '\n']))
+							annot_file.write(''.join(['\t'.join([t, 'annotation_font_size', str(font_size)]), '\n']))
+
+				# put a background annotation on the clades specified by the user
+				for c in background_colors :
+					bg_color = background_colors[c]
+
+					if not bg_color.startswith('#') :
+						bg_color = bg_color.replace('(', '').replace(')', '')
+						h, s, v = bg_color.split(';')
+						bg_color = scale_color((float(h.strip()) , float(s.strip()), float(v.strip())))
+
+					# check if the taxonomy has more than one level
+					lvls = [str(cc.strip()) for cc in c.split('.')]
+
+					for l in lvls :
+						if (l in taxonomy) and (l not in done_clades) :
+							lvl = taxonomy[:taxonomy.index(l)].count('.') + 1
+							font_size = args.min_font_size + ((args.max_font_size - args.min_font_size) / lvl)
+
+							annot_file.write(''.join(['\t'.join([l, 'annotation_background_color', bg_color]), '\n']))
+							annot_file.write(''.join(['\t'.join([l, 'annotation', '*']), '\n']))
+							annot_file.write(''.join(['\t'.join([l, 'annotation_font_size', str(font_size)]), '\n']))
+
+							done_clades.append(l)
+
+				if lefse_output :
+					if taxonomy in lefse_output :
+						es, bk, _, _ = lefse_output[taxonomy]
+
+						# if it is a biomarker then color and label it!
+						if bk :
+							fac = abs(log(float(es) / max_effect_size)) / max_log_effect_size
+							
+							try :
+								rgbs = scale_color(colors[color[bk]], fac)
+							except Exception as e :
+								print e
+								print ' '.join(["[W] Assign to", taxonomy, "the default color:", colors[color[bk]]])
+								rgbs = colors[color[bk]]
+
+							annot_file.write(''.join(['\t'.join([clean_taxonomy, 'clade_marker_color', rgbs]), '\n']))
+
+							# write the annotation only if the abundance is above the given threshold and the level is in either the internal or the external annotation list
+							if (scaled >= args.abundance_threshold) and ((level in annotations_list) or (level in external_annotations_list)) :
+								font_size = args.min_font_size + ((args.max_font_size - args.min_font_size) / level)
+								annotation = '*' if level in annotations_list else '*:*'
+
+								annot_file.write(''.join(['\t'.join([clean_taxonomy, 'annotation_background_color', rgbs]), '\n']))
+								annot_file.write(''.join(['\t'.join([clean_taxonomy, 'annotation', annotation]), '\n']))
+								annot_file.write(''.join(['\t'.join([clean_taxonomy, 'annotation_font_size', str(font_size)]), '\n']))
+	except Exception as e :
+		print e
+
+
+if __name__ == '__main__' :
+	main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/export2graphlan.xml	Tue Aug 26 14:51:29 2014 -0400
@@ -0,0 +1,85 @@
+<tool id="export2graphlan" name="export2graphlan" version="1.0.0">
+  <description>Export to Graphlan</description>
+  <command  interpreter="python">
+    export2graphlan.py 
+	-i $inp_data 
+	-o $out_data
+	-t $output_tree_file 
+	-a $output_annot_file 
+	--title $export_title
+	--annotations $export_annotations 
+	--external_annotations $export_external_annotations
+	--skip_rows 1,2 
+   </command>
+   
+	<inputs>
+	    <param name="export_title" type="text" format="text" label="Title" value="Title"/>
+	    <param name="export_annotations" type="text" format="text" label="Annotations" value="2,3"/>
+	    <param name="export_external_annotations" type="text" format="text" label="External Annotations" value="4,5,6"/>   
+		<param format="tabular" name="inp_data" type="data" label="Input used to run Lefse -  See samples below - Please use Galaxy Get-Data/Upload-File. Use File-Type = Tabular" 	help="This is the file that was used as input for Lefse"/>
+		<param format="lefse_res" name="out_data" type="data" label="Output of  Lefse"  help="This is the  Lefse output file"/>	
+    </inputs>
+	<outputs>
+            <data  name="output_annot_file"  format="circl"  />
+            <data  name="output_tree_file"  format="circl"  />
+	</outputs>
+                                  
+  <help>
+**export2graphlan** is an automatic conversion script from **LEfSe**, **MetaPhlAn2**, and **HUMAnN** input and/or output files to **GraPhlAn**. The input file can also be given in BIOM 2.0 format.
+
+The aim of this tool is to support biologists by automatically writing the tree and the annotation file for **GraPhlAn**.
+
+----
+
+.. contents::
+
+----
+
+Overview
+========
+
+A graphical representation of how **export2graphlan** can be used in the analysis pipeline:
+
+.. image:: https://bitbucket.org/repo/oL6bEG/images/3364692296-graphlan_integration.png
+    :height: 500   
+    :width: 600 
+ 
+
+----
+
+HMP aerobiosis
+==============
+
+A taxonomic tree that shows three different classes of oxygen (low, medium, and high), highlighting biomarker clades for each class. Data are taken from the HMP project.
+
+.. image:: https://bitbucket.org/repo/oL6bEG/images/2487320460-hmp_aerobiosis.png
+    :height: 500   
+    :width: 600 
+ 
+
+Pipeline
+--------
+
+::
+
+    # download the data
+    $ wget http://huttenhower.sph.harvard.edu/webfm_send/129 -O hmp_aerobiosis_small.txt
+
+    # convert the file in LEfSe format, and run LEfSe
+    $ format_input.py hmp_aerobiosis_small.txt hmp_aerobiosis_small.in -c 1 -s 2 -u 3 -o 1000000
+    $ run_lefse.py hmp_aerobiosis_small.in hmp_aerobiosis_small.res
+
+    # convert it!
+    $ export2graphlan.py -i hmp_aerobiosis_small.txt -o hmp_aerobiosis_small.res -t tree.txt -a annot.txt --title "HMP aerobiosis" --annotations 2,3 --external_annotations 4,5,6 --fname_row 0 --skip_rows 1,2 --ftop 200
+
+    # attach annotation to the tree
+    $ graphlan_annotate.py --annot annot.txt tree.txt outtree.txt
+
+    # generate the beautiful image
+    $ graphlan.py --dpi 300 --size 7.0 outtree.txt outimg.png --external_legends
+
+The input file is downloaded from The Huttenhower Lab and given to **LEfSe** for biomarker discovery. The two files (the *LEfSe input* and the *LEfSe output*) are then passed to **export2graphlan**. In this case levels 2 (*phylum*) and 3 (*class*) are annotated on the circular tree, while levels 4 (*order*), 5 (*family*), and 6 (*genus*) are put in the external legend.
+ 
+
+
+  </help>
+</tool>
Binary file hclust2/__init__.pyc has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hclust2/hclust2.py	Tue Aug 26 14:51:29 2014 -0400
@@ -0,0 +1,758 @@
+#!/usr/bin/env python
+
+import sys
+import numpy as np
+import matplotlib.ticker as ticker
+import scipy.spatial.distance as spd 
+import scipy.cluster.hierarchy as sph
+from scipy import stats
+import matplotlib
+#matplotlib.use('Agg')
+import pylab
+import pandas as pd
+from matplotlib.patches import Rectangle
+from mpl_toolkits.axes_grid1 import make_axes_locatable
+import matplotlib.pyplot as plt
+import matplotlib.gridspec as gridspec
+import cPickle as pickle
+
+sys.setrecursionlimit(10000)
+
+# samples on rows
+
+class SqrtNorm(matplotlib.colors.Normalize):
+    """
+    Normalize a given value to the 0-1 range on a square root scale
+    """
+    def __call__(self, value, clip=None):
+        if clip is None:
+            clip = self.clip
+
+        result, is_scalar = self.process_value(value)
+
+        result = np.ma.masked_less_equal(result, 0, copy=False)
+
+        self.autoscale_None(result)
+        vmin, vmax = self.vmin, self.vmax
+        if vmin > vmax:
+            raise ValueError("minvalue must be less than or equal to maxvalue")
+        elif vmin <= 0:
+            raise ValueError("values must all be positive")
+        elif vmin == vmax:
+            result.fill(0)
+        else:
+            if clip:
+                mask = np.ma.getmask(result)
+                result = np.ma.array(np.clip(result.filled(vmax), vmin, vmax),
+                                  mask=mask)
+            # in-place equivalent of above can be much faster
+            resdat = result.data
+            mask = result.mask
+            if mask is np.ma.nomask:
+                mask = (resdat <= 0)
+            else:
+                mask |= resdat <= 0
+            matplotlib.cbook._putmask(resdat, mask, 1)
+            np.sqrt(resdat, resdat)
+            resdat -= np.sqrt(vmin)
+            resdat /= (np.sqrt(vmax) - np.sqrt(vmin))
+            result = np.ma.array(resdat, mask=mask, copy=False)
+        if is_scalar:
+            result = result[0]
+        return result
+
+    def inverse(self, value):
+        if not self.scaled():
+            raise ValueError("Not invertible until scaled")
+        vmin, vmax = self.vmin, self.vmax
+
+        # invert the square root scaling applied in __call__
+        if matplotlib.cbook.iterable(value):
+            val = np.ma.asarray(value)
+            return np.ma.power(val * (np.sqrt(vmax) - np.sqrt(vmin)) + np.sqrt(vmin), 2)
+        else:
+            return pow(value * (np.sqrt(vmax) - np.sqrt(vmin)) + np.sqrt(vmin), 2)
+
+    def autoscale(self, A):
+        '''
+        Set *vmin*, *vmax* to min, max of *A*.
+        '''
+        A = np.ma.masked_less_equal(A, 0, copy=False)
+        self.vmin = np.ma.min(A)
+        self.vmax = np.ma.max(A)
+
+    def autoscale_None(self, A):
+        ' autoscale only None-valued vmin or vmax'
+        if self.vmin is not None and self.vmax is not None:
+            return
+        A = np.ma.masked_less_equal(A, 0, copy=False)
+        if self.vmin is None:
+            self.vmin = np.ma.min(A)
+        if self.vmax is None:
+            self.vmax = np.ma.max(A)
+
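+# Example: with vmin=1 and vmax=9, SqrtNorm maps the value 4 to
+# (sqrt(4) - sqrt(1)) / (sqrt(9) - sqrt(1)) = 0.5.
+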
+class DataMatrix:
+    datatype = 'data_matrix'
+    
+    @staticmethod
+    def input_parameters( parser ):
+        dm_param = parser.add_argument_group('Input data matrix parameters')
+        arg = dm_param.add_argument
+
+        arg( '--sep', type=str, default='\t' )
+        arg( '--out_table', type=str, default=None,
+             help = 'Write processed data matrix to file' )
+        arg( '--fname_row', type=int, default=0,
+             help = "row number containing the names of the features "
+                    "[default 0, specify -1 if no names are present in the matrix")
+        arg( '--sname_row', type=int, default=0,
+             help = "column number containing the names of the samples "
+                    "[default 0, specify -1 if no names are present in the matrix")
+        arg( '--metadata_rows', type=str, default=None,
+             help = "Row numbers to use as metadata"
+                    "[default None, meaning no metadata")
+        arg( '--skip_rows', type=str, default=None,
+             help = "Row numbers to skip (0-indexed, comma separated) from the input file"
+                    "[default None, meaning no rows skipped")
+        arg( '--sperc', type=int, default=90, 
+             help = "Percentile of sample value distribution for sample selection" )
+        arg( '--fperc', type=int, default=90, 
+             help = "Percentile of feature value distribution for sample selection" )
+        arg( '--stop', type=int, default=None, 
+             help = "Number of top samples to select (ordering based on percentile specified by --sperc)" )
+        arg( '--ftop', type=int, default=None, 
+             help = "Number of top features to select (ordering based on percentile specified by --fperc)" )
+        arg( '--def_na', type=float, default=None,
+             help = "Set the default value for missing values [default None which means no replacement]")
+
+    def __init__( self, input_file, args ):
+        self.args = args
+        self.metadata_rows =  [] 
+        self.metadata_table = None
+        toskip = [int(l) for l in self.args.skip_rows.split(",")]  if self.args.skip_rows else [] 
+        if self.args.metadata_rows:
+            self.metadata_rows = list([int(a) for a in self.args.metadata_rows.split(",")])
+            mdr = self.metadata_rows[::]
+            for t in toskip:
+                for i,m in enumerate(mdr):
+                    if t <= m:
+                        self.metadata_rows[i] -= 1
+        if self.metadata_rows:
+            header = [self.args.fname_row]+self.metadata_rows if self.args.fname_row > -1 else self.metadata_rows
+        else:
+            header = self.args.fname_row if self.args.fname_row > -1 else None        
+        self.table = pd.read_table( 
+                input_file, sep = self.args.sep, # skipinitialspace = True, 
+                                  skiprows = sorted(toskip) if isinstance(toskip, list) else toskip,
+                                  header = sorted(header) if isinstance(header, list) else header,
+                                  index_col = self.args.sname_row if self.args.sname_row > -1 else None
+                                    )
+        
+        def select( perc, top  ): 
+            self.table['perc'] = self.table.apply(lambda x: stats.scoreatpercentile(x,perc),axis=1)
+            m = sorted(self.table['perc'])[-top]
+            self.table = self.table[self.table['perc'] >= m ]
+            del self.table['perc'] 
+        
+        if self.args.def_na is not None:
+            self.table = self.table.fillna( self.args.def_na )
+
+        if self.args.ftop:
+            select( self.args.fperc, self.args.ftop )
+        
+        if self.args.stop:
+            self.table = self.table.T 
+            select( self.args.sperc, self.args.stop ) 
+            self.table = self.table.T
+        
+
+        # add missing values
+        
+    def get_numpy_matrix( self ): 
+        return np.matrix(self.table)
+    
+    #def get_metadata_matrix( self ):
+    #    return self.table.columns
+    
+    def get_snames( self ):
+        #return list(self.table.index)
+        return self.table.columns
+    
+    def get_fnames( self ):
+        #print self.table.columns.names
+        #print self.table.columns
+        return list(self.table.index)
+    
+    def get_averages(self, by_row = True) :
+        return self.table.mean(axis = 1 if by_row else 0)
+    
+    def save_matrix( self, output_file ):
+        self.table.to_csv( output_file, sep = '\t' )
+
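+# A minimal standalone sketch of DataMatrix (hypothetical: 'abundances.txt'
+# is a made-up file name; only the defaults registered by input_parameters
+# are assumed):
+#   from argparse import ArgumentParser
+#   p = ArgumentParser()
+#   DataMatrix.input_parameters(p)
+#   dm = DataMatrix('abundances.txt', p.parse_args([]))
+#   print dm.get_fnames(), dm.get_averages()
+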
+class DistMatrix:
+    datatype = 'distance_matrix'
+
+    @staticmethod
+    def input_parameters( parser ):
+        dm_param = parser.add_argument_group('Distance parameters')
+        arg = dm_param.add_argument
+
+        dist_funcs = [  "euclidean","minkowski","cityblock","seuclidean",
+                        "sqeuclidean","cosine","correlation","hamming",
+                        "jaccard","chebyshev","canberra","braycurtis",
+                        "mahalanobis","yule","matching","dice",
+                        "kulsinski","rogerstanimoto","russellrao","sokalmichener",
+                        "sokalsneath","wminkowski","ward" ]
+
+        arg( '--f_dist_f', type=str, default="correlation",
+             help = "Distance function for features [default correlation]")
+        arg( '--s_dist_f', type=str, default="euclidean",
+             help = "Distance function for sample [default euclidean]")
+        arg( '--load_dist_matrix_f', type=str, default=None,
+             help = "Load the distance matrix to be used for features [default None].")
+        arg( '--load_dist_matrix_s', type=str, default=None,
+             help = "Load the distance matrix to be used for samples [default None].")
+        arg( '--save_dist_matrix_f', type=str, default=None,
+             help = "Save the distance matrix for features to file [default None].")
+        arg( '--save_dist_matrix_s', type=str, default=None,
+             help = "Save the distance matrix for samples to file [default None].")
+    
+    def __init__( self, data, args = None ):
+        self.args = args
+        self.sdf = args.s_dist_f
+        self.fdf = args.f_dist_f
+
+        self.s_cdist_matrix, self.f_cdist_matrix = None, None
+
+        self.numpy_full_matrix = (data if 
+                type(data) == np.matrixlib.defmatrix.matrix else None)
+    
+    def compute_f_dists( self ):
+        if self.args.load_dist_matrix_f:
+            with open( self.args.load_dist_matrix_f ) as inp:
+                self.f_cdist_matrix = pickle.load( inp )
+
+        else:
+            dt = self.numpy_full_matrix
+            
+            if self.fdf == "spearman":
+                dt_ranked = np.matrix([stats.rankdata(d) for d in dt])
+                self.f_cdist_matrix = spd.pdist( dt_ranked, "correlation" )
+                return
+        
+            if self.fdf == "pearson":
+                self.fdf = 'correlation'
+
+            self.f_cdist_matrix = spd.pdist( dt, self.fdf )
+
+        if self.args.save_dist_matrix_f:
+            with open( self.args.save_dist_matrix_f, "wb" ) as outf:
+                pickle.dump( self.f_cdist_matrix, outf )
+    
+    def compute_s_dists( self ):
+        if self.args.load_dist_matrix_s:
+            with open( self.args.load_dist_matrix_s ) as inp:
+                self.s_cdist_matrix = pickle.load( inp )
+        else: 
+            dt = self.numpy_full_matrix.transpose()
+            
+            if self.sdf == "spearman":
+                dt_ranked = np.matrix([stats.rankdata(d) for d in dt])
+                self.s_cdist_matrix = spd.pdist( dt_ranked, "correlation" )
+                return
+            
+            if self.sdf == "pearson":
+                self.sdf = 'correlation'
+        
+            self.s_cdist_matrix = spd.pdist( dt, self.sdf )
+        
+        if self.args.save_dist_matrix_s:
+            with open( self.args.save_dist_matrix_s, "wb" ) as outf:
+                pickle.dump( self.s_cdist_matrix, outf )
+
+    def get_s_dm( self ):
+        return self.s_cdist_matrix
+
+    def get_f_dm( self ):
+        return self.f_cdist_matrix
+
+class HClustering:
+    datatype = 'hclustering'
+
+    @staticmethod
+    def input_parameters( parser ):
+        cl_param = parser.add_argument_group('Clustering parameters')
+        arg = cl_param.add_argument
+
+        linkage_method = [ "single","complete","average", 
+                           "weighted","centroid","median",
+                           "ward" ]
+        arg( '--no_fclustering', action='store_true',
+             help = "avoid clustering features" )
+        arg( '--no_sclustering', action='store_true',
+             help = "avoid clustering samples" )
+        arg( '--flinkage', type=str, default="average",
+             help = "Linkage method for feature clustering [default average]")
+        arg( '--slinkage', type=str, default="average",
+             help = "Linkage method for sample clustering [default average]")
+
+    def get_reordered_matrix( self, matrix, sclustering = True, fclustering = True ):
+        if not sclustering and not fclustering:
+            return matrix
+        
+        idx1 = self.sdendrogram['leaves'] if sclustering else None   # !!!!!!!!!!!
+        idx2 = self.fdendrogram['leaves'][::-1] if fclustering else None
+
+        if sclustering and fclustering:
+            return matrix[idx2,:][:,idx1]
+        if fclustering:
+            return matrix[idx2,:][:]
+        if sclustering: # !!!!!!!!!!!!
+            return matrix[:][:,idx1]
+
+    def get_reordered_sample_labels( self, slabels ):
+        return [slabels[i] for i in self.sdendrogram['leaves']]
+
+    def get_reordered_feature_labels( self, flabels ):
+        return [flabels[i] for i in self.fdendrogram['leaves']]
+    
+    def __init__( self, s_dm, f_dm, args = None ):
+        self.s_dm = s_dm
+        self.f_dm = f_dm
+        self.args = args
+        self.sclusters = None
+        self.fclusters = None
+        self.sdendrogram = None
+        self.fdendrogram = None
+
+    def shcluster( self, dendrogram = True ):
+        self.shclusters = sph.linkage( self.s_dm, self.args.slinkage )
+        if dendrogram:
+            self.sdendrogram = sph.dendrogram( self.shclusters, no_plot=True )
+
+    def fhcluster( self, dendrogram = True ):
+        self.fhclusters = sph.linkage( self.f_dm, self.args.flinkage )
+        if dendrogram:
+            self.fdendrogram = sph.dendrogram( self.fhclusters, no_plot=True )
+    
+    def get_shclusters( self ):
+        return self.shclusters
+    
+    def get_fhclusters( self ):
+        return self.fhclusters
+    
+    def get_sdendrogram( self ):
+        return self.sdendrogram
+    
+    def get_fdendrogram( self ):
+        return self.fdendrogram
+
+
+class Heatmap:
+    datatype = 'heatmap'
+   
+    bbcyr = {'red':  (  (0.0, 0.0, 0.0),
+                        (0.25, 0.0, 0.0),
+                        (0.50, 0.0, 0.0),
+                        (0.75, 1.0, 1.0),
+                        (1.0, 1.0, 1.0)),
+             'green': ( (0.0, 0.0, 0.0),
+                        (0.25, 0.0, 0.0),
+                        (0.50, 1.0, 1.0),
+                        (0.75, 1.0, 1.0),
+                        (1.0, 0.0, 1.0)),
+             'blue': (  (0.0, 0.0, 0.0),
+                        (0.25, 1.0, 1.0),
+                        (0.50, 1.0, 1.0),
+                        (0.75, 0.0, 0.0),
+                        (1.0, 0.0, 1.0))}
+
+    bbcry = {'red':  (  (0.0, 0.0, 0.0),
+                        (0.25, 0.0, 0.0),
+                        (0.50, 0.0, 0.0),
+                        (0.75, 1.0, 1.0),
+                        (1.0, 1.0, 1.0)),
+             'green': ( (0.0, 0.0, 0.0),
+                        (0.25, 0.0, 0.0),
+                        (0.50, 1.0, 1.0),
+                        (0.75, 0.0, 0.0),
+                        (1.0, 1.0, 1.0)),
+             'blue': (  (0.0, 0.0, 0.0),
+                        (0.25, 1.0, 1.0),
+                        (0.50, 1.0, 1.0),
+                        (0.75, 0.0, 0.0),
+                        (1.0, 0.0, 1.0))}
+
+    bcry = {'red':  (   (0.0, 0.0, 0.0),
+                        (0.33, 0.0, 0.0),
+                        (0.66, 1.0, 1.0),
+                        (1.0, 1.0, 1.0)),
+             'green': ( (0.0, 0.0, 0.0),
+                        (0.33, 1.0, 1.0),
+                        (0.66, 0.0, 0.0),
+                        (1.0, 1.0, 1.0)),
+             'blue': (  (0.0, 1.0, 1.0),
+                        (0.33, 1.0, 1.0),
+                        (0.66, 0.0, 0.0),
+                        (1.0, 0.0, 1.0))}
+    
+
+    my_colormaps = [    ('bbcyr',bbcyr),
+                        ('bbcry',bbcry),
+                        ('bcry',bcry)]
+   
+    dcols = ['#ca0000','#0087ff','#00ba1d','#cf00ff','#00dbe2','#ffaf00','#0017f4','#006012','#e175ff','#877878','#050505','#b5cf00','#ff8a8a','#aa6400','#50008a','#00ff58']
+
+
+    @staticmethod
+    def input_parameters( parser ):
+        hm_param = parser.add_argument_group('Heatmap options')
+        arg = hm_param.add_argument
+
+        arg( '--dpi', type=int, default=150,
+             help = "Image resolution in dpi [default 150]")
+        arg( '-l', '--log_scale', action='store_true',
+             help = "Log scale" )
+        arg( '-s', '--sqrt_scale', action='store_true',
+             help = "Square root scale" )
+        arg( '--no_slabels', action='store_true',
+             help = "Do not show sample labels" )
+        arg( '--minv', type=float, default=None,
+             help = "Minimum value to display in the color map [default None meaning automatic]" )
+        arg( '--maxv', type=float, default=None,
+             help = "Maximum value to display in the color map [default None meaning automatic]" )
+        arg( '--no_flabels', action='store_true',
+             help = "Do not show feature labels" )
+        arg( '--max_slabel_len', type=int, default=25,
+             help = "Max number of chars to report for sample labels [default 15]" )
+        arg( '--max_flabel_len', type=int, default=25,
+             help = "Max number of chars to report for feature labels [default 15]" )
+        arg( '--flabel_size', type=int, default=10,
+             help = "Feature label font size [default 10]" )
+        arg( '--slabel_size', type=int, default=10,
+             help = "Sample label font size [default 10]" )
+        arg( '--fdend_width', type=float, default=1.0,
+             help = "Width of the feature dendrogram [default 1 meaning 100%% of default heatmap width]")
+        arg( '--sdend_height', type=float, default=1.0,
+             help = "Height of the sample dendrogram [default 1 meaning 100%% of default heatmap height]")
+        arg( '--metadata_height', type=float, default=.05,
+             help = "Height of the metadata panel [default 0.05 meaning 5%% of default heatmap height]")
+        arg( '--metadata_separation', type=float, default=.01,
+             help = "Distance between the metadata and data panels. [default 0.001 meaning 0.1%% of default heatmap height]")
+        arg( '--image_size', type=float, default=8,
+             help = "Size of the largest between width and eight size for the image in inches [default 8]")
+        arg( '--cell_aspect_ratio', type=float, default=1.0,
+             help = "Aspect ratio between width and height for the cells of the heatmap [default 1.0]")
+        col_maps = ['Accent', 'Blues', 'BrBG', 'BuGn', 'BuPu', 'Dark2', 'GnBu',
+                    'Greens', 'Greys', 'OrRd', 'Oranges', 'PRGn', 'Paired',
+                    'Pastel1', 'Pastel2', 'PiYG', 'PuBu', 'PuBuGn', 'PuOr',
+                    'PuRd', 'Purples', 'RdBu', 'RdGy', 'RdPu', 'RdYlBu', 'RdYlGn',
+                    'Reds', 'Set1', 'Set2', 'Set3', 'Spectral', 'YlGn', 'YlGnBu',
+                    'YlOrBr', 'YlOrRd', 'afmhot', 'autumn', 'binary', 'bone',
+                    'brg', 'bwr', 'cool', 'copper', 'flag', 'gist_earth',
+                    'gist_gray', 'gist_heat', 'gist_ncar', 'gist_rainbow',
+                    'gist_stern', 'gist_yarg', 'gnuplot', 'gnuplot2', 'gray',
+                    'hot', 'hsv', 'jet', 'ocean', 'pink', 'prism', 'rainbow',
+                    'seismic', 'spectral', 'spring', 'summer', 'terrain', 'winter'] + [n for n,c in Heatmap.my_colormaps]
+        for n,c in Heatmap.my_colormaps:
+            my_cmap = matplotlib.colors.LinearSegmentedColormap(n,c,256)
+            pylab.register_cmap(name=n,cmap=my_cmap)
+        arg( '-c','--colormap', type=str, choices = col_maps, default = 'bbcry' )
+        arg( '--bottom_c', type=str, default = None,
+             help = "Color to use for cells below the minimum value of the scale [default None meaning bottom color of the scale]")
+        arg( '--top_c', type=str, default = None,
+             help = "Color to use for cells below the maximum value of the scale [default None meaning bottom color of the scale]")
+        arg( '--nan_c', type=str, default = None,
+             help = "Color to use for nan cells  [default None]")
+
+        
+
+        """
+        arg( '--', type=str, default="average",
+             help = "Linkage method for feature clustering [default average]")
+        arg( '--slinkage', type=str, default="average",
+             help = "Linkage method for sample clustering [default average]")
+        """
+
+    def __init__( self, numpy_matrix, sdendrogram, fdendrogram, snames, fnames, fnames_meta, args = None ):
+        self.numpy_matrix = numpy_matrix
+        self.sdendrogram = sdendrogram
+        self.fdendrogram = fdendrogram
+        self.snames = snames
+        self.fnames = fnames
+        self.fnames_meta = fnames_meta
+        self.ns,self.nf = self.numpy_matrix.shape
+        self.args = args
+
+    def make_legend( self, dmap, titles, out_fn ): 
+        figlegend = plt.figure(figsize=(1+3*len(titles),2), frameon = False)
+
+        gs = gridspec.GridSpec( 1, len(dmap), wspace = 2.0  )
+
+        for i,(d,title) in enumerate(zip(dmap,titles)):
+            legax = plt.subplot(gs[i],frameon = False)
+            for k,v in sorted(d.items(),key=lambda x:x[1]):
+                rect = Rectangle( [0.0, 0.0], 0.0, 0.0,
+                                  facecolor = self.dcols[v%len(self.dcols)],
+                                  label = k,
+                                  edgecolor='b', lw = 0.0)
+
+                legax.add_patch(rect)
+        #remove_splines( legax )
+            legax.set_xticks([])
+            legax.set_yticks([])
+            legax.legend( loc = 2, frameon = False, title = title) 
+        """
+                      ncol = legend_ncol, bbox_to_anchor=(1.01, 3.),
+                      borderpad = 0.0, labelspacing = 0.0,
+                      handlelength = 0.5, handletextpad = 0.3,
+                      borderaxespad = 0.0, columnspacing = 0.3,
+                      prop = {'size':fontsize}, frameon = False)
+        """
+        if out_fn:
+            figlegend.savefig(out_fn, bbox_inches='tight')
+    
+    def draw( self ):
+
+        rat = float(self.ns)/self.nf
+        rat *= self.args.cell_aspect_ratio
+        x,y = (self.args.image_size,rat*self.args.image_size) if rat < 1 else (self.args.image_size/rat,self.args.image_size)
+        fig = plt.figure( figsize=(x,y), facecolor = 'w'  )
+
+        cm = pylab.get_cmap(self.args.colormap)
+        bottom_col = [  cm._segmentdata['red'][0][1],
+                        cm._segmentdata['green'][0][1],
+                        cm._segmentdata['blue'][0][1]   ]
+        if self.args.bottom_c:
+            bottom_col = self.args.bottom_c
+        cm.set_under( bottom_col )
+        top_col = [  cm._segmentdata['red'][-1][1],
+                     cm._segmentdata['green'][-1][1],
+                     cm._segmentdata['blue'][-1][1]   ]
+        if self.args.top_c:
+            top_col = self.args.top_c
+        cm.set_over( top_col )
+
+        if self.args.nan_c:
+            cm.set_bad( self.args.nan_c  )
+
+        def make_ticklabels_invisible(ax):
+            for tl in ax.get_xticklabels() + ax.get_yticklabels():
+                 tl.set_visible(False)
+            ax.set_xticks([])
+            ax.set_yticks([])
+      
+        def remove_splines( ax ):
+            for v in ['right','left','top','bottom']:
+                ax.spines[v].set_color('none')
+
+        def shrink_labels( labels, n ):
+            shrink = lambda x: x[:n/2]+" [...] "+x[-n/2:]
+            return [(shrink(str(l)) if len(str(l)) > n else l) for l in labels]
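+        # e.g. shrink_labels(['Bacteroides_fragilis_unclassified'], 10)
+        # gives ['Bacte [...] ified']: n/2 leading and trailing chars are kept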
+        
+
+        #gs = gridspec.GridSpec( 4, 2, 
+        #                        width_ratios=[1.0-fr_ns,fr_ns], 
+        #                        height_ratios=[.03,0.03,1.0-fr_nf,fr_nf], 
+        #                        wspace = 0.0, hspace = 0.0 )
+        
+        fr_ns = float(self.ns)/max([self.ns,self.nf])
+        fr_nf = float(self.nf)/max([self.ns,self.nf])
+       
+        buf_space = 0.05
+        minv = min( [buf_space*8, 8*rat*buf_space] )
+        if minv < 0.05:
+            buf_space /= minv/0.05
+        metadata_height = self.args.metadata_height if type(self.snames[0]) is tuple and len(self.snames[0]) > 1 else 0.000001
+        gs = gridspec.GridSpec( 6, 4, 
+                                width_ratios=[ buf_space, buf_space*2, .08*self.args.fdend_width,0.9], 
+                                height_ratios=[ buf_space, buf_space*2, .08*self.args.sdend_height, metadata_height, self.args.metadata_separation, 0.9], 
+                                wspace = 0.0, hspace = 0.0 )
+
+        ax_hm = plt.subplot(gs[23], axisbg = bottom_col  )
+        ax_metadata = plt.subplot(gs[15], axisbg = bottom_col  )
+        ax_hm_y2 = ax_hm.twinx() 
+
+        norm_f = matplotlib.colors.Normalize
+        if self.args.log_scale:
+            norm_f = matplotlib.colors.LogNorm
+        elif self.args.sqrt_scale:
+            norm_f = SqrtNorm
+        minv, maxv = 0.0, None
+
+        maps, values, ndv = [], [], 0
+        if type(self.snames[0]) is tuple and len(self.snames[0]) > 1:
+            metadata = zip(*[list(s[1:]) for s in self.snames])
+            for m in metadata:
+                mmap = dict([(v[1],ndv+v[0]) for v in enumerate(list(set(m)))])
+                values.append([mmap[v] for v in m])
+                ndv += len(mmap)
+                maps.append(mmap)
+            dcols = [] 
+            mdmat = np.matrix(values)
+            while len(dcols) < ndv:
+                dcols += self.dcols
+            cmap = matplotlib.colors.ListedColormap(dcols[:ndv]) 
+            bounds = [float(f)-0.5 for f in range(ndv+1)]
+            imm = ax_metadata.imshow( mdmat, #origin='lower', 
+                    interpolation = 'nearest',  
+                                    aspect='auto', 
+                                    extent = [0, self.nf, 0, self.ns], 
+                                    cmap=cmap,
+                                    vmin=bounds[0],
+                                    vmax=bounds[-1],
+                                    )
+            remove_splines( ax_metadata )
+            ax_metadata_y2 = ax_metadata.twinx() 
+            ax_metadata_y2.set_ylim(0,len(self.fnames_meta))
+            ax_metadata.set_yticks([])
+            ax_metadata_y2.set_ylim(0,len(self.fnames_meta))
+            ax_metadata_y2.tick_params(length=0)
+            ax_metadata_y2.set_yticks(np.arange(len(self.fnames_meta))+0.5)
+            ax_metadata_y2.set_yticklabels(self.fnames_meta[::-1], va='center',size=self.args.flabel_size)
+        else:
+            ax_metadata.set_yticks([])
+
+        ax_metadata.set_xticks([])
+        
+        im = ax_hm.imshow( self.numpy_matrix, #origin='lower', 
+                                interpolation = 'nearest',  aspect='auto', 
+                                extent = [0, self.nf, 0, self.ns], 
+                                cmap=cm, 
+                                vmin=self.args.minv,
+                                vmax=self.args.maxv, 
+                                norm = norm_f( vmin=minv if minv > 0.0 else None, vmax=maxv)
+                                )
+        
+        #ax_hm.set_ylim([0,800])
+        ax_hm.set_xticks(np.arange(len(list(self.snames)))+0.5)
+        if not self.args.no_slabels:
+            snames_short = shrink_labels( list([s[0] for s in self.snames]) if type(self.snames[0]) is tuple else self.snames, self.args.max_slabel_len )
+            ax_hm.set_xticklabels(snames_short,rotation=90,va='top',ha='center',size=self.args.slabel_size)
+        else:
+            ax_hm.set_xticklabels([])
+        ax_hm_y2.set_ylim([0,self.ns])
+        ax_hm_y2.set_yticks(np.arange(len(self.fnames))+0.5)
+        if not self.args.no_flabels:
+            fnames_short = shrink_labels( self.fnames, self.args.max_flabel_len )
+            ax_hm_y2.set_yticklabels(fnames_short,va='center',size=self.args.flabel_size)
+        else:
+            ax_hm_y2.set_yticklabels( [] )
+        ax_hm.set_yticks([])
+        remove_splines( ax_hm )
+        ax_hm.tick_params(length=0)
+        ax_hm_y2.tick_params(length=0)
+        #ax_hm.set_xlim([0,self.ns])
+        ax_cm = plt.subplot(gs[3], axisbg = 'r', frameon = False)
+        #fig.colorbar(im, ax_cm, orientation = 'horizontal', spacing = 'proportional', format = ticker.LogFormatterMathtext() )
+        fig.colorbar(im, ax_cm, orientation = 'horizontal', spacing='proportional' if self.args.sqrt_scale else 'uniform' ) # , format = ticker.LogFormatterMathtext() )
+
+        if not self.args.no_sclustering:
+            ax_den_top = plt.subplot(gs[11], axisbg = 'r', frameon = False)
+            sph._plot_dendrogram( self.sdendrogram['icoord'], self.sdendrogram['dcoord'], self.sdendrogram['ivl'],
+                                  self.ns + 1, self.nf + 1, 1, 'top', no_labels=True,
+                                  color_list=self.sdendrogram['color_list'] )
+            ymax = max([max(a) for a in self.sdendrogram['dcoord']])
+            ax_den_top.set_ylim([0,ymax])
+            make_ticklabels_invisible( ax_den_top )
+        if not self.args.no_fclustering:
+            ax_den_right = plt.subplot(gs[22], axisbg = 'b', frameon = False)
+            sph._plot_dendrogram(   self.fdendrogram['icoord'], self.fdendrogram['dcoord'], self.fdendrogram['ivl'],
+                                    self.ns + 1, self.nf + 1, 1, 'right', no_labels=True,
+                                    color_list=self.fdendrogram['color_list'] )
+            xmax = max([max(a) for a in self.fdendrogram['dcoord']])
+            ax_den_right.set_xlim([xmax,0])
+            make_ticklabels_invisible( ax_den_right )
+
+        
+        if not self.args.out:
+            plt.show( )
+        else:
+            fig.savefig( self.args.out, bbox_inches='tight', dpi = self.args.dpi )
+            if maps: 
+                self.make_legend( maps, self.fnames_meta, self.args.legend_file )
+
+
+
+class ReadCmd:
+
+    def __init__( self ):
+        import argparse as ap
+        import textwrap
+
+        p = ap.ArgumentParser( description= "TBA" )
+        arg = p.add_argument
+        
+        arg( '-i', '--inp', '--in', metavar='INPUT_FILE', type=str, nargs='?', default=sys.stdin,
+             help= "The input matrix" )
+        arg( '-o', '--out', metavar='OUTPUT_FILE', type=str, nargs='?', default=None,
+             help= "The output image file [image on screen of not specified]" )
+        arg( '--legend_file', metavar='LEGEND_FILE', type=str, nargs='?', default=None,
+             help= "The output file for the legend of the provided metadata" )
+
+        input_types = [DataMatrix.datatype,DistMatrix.datatype]
+        arg( '-t', '--input_type', metavar='INPUT_TYPE', type=str, choices = input_types, 
+             default='data_matrix',
+             help= "The input type can be a data matrix or distance matrix [default data_matrix]" )
+
+        DataMatrix.input_parameters( p )
+        DistMatrix.input_parameters( p )
+        HClustering.input_parameters( p )
+        Heatmap.input_parameters( p )
+
+        self.args  = p.parse_args()
+
+    def check_consistency( self ):
+        pass
+
+    def get_args( self ):
+        return self.args
+
+if __name__ == '__main__':
+     
+    read = ReadCmd( )
+    read.check_consistency()
+    args = read.get_args()
+    
+    if args.input_type == DataMatrix.datatype:
+        dm = DataMatrix( args.inp, args ) 
+        if args.out_table:
+            dm.save_matrix( args.out_table )
+        
+        distm = DistMatrix( dm.get_numpy_matrix(), args = args )
+        if not args.no_sclustering:
+            distm.compute_s_dists()
+        if not args.no_fclustering:
+            distm.compute_f_dists()
+    elif args.input_type == DistMatrix.datatype:
+        # distm = read...
+        pass
+    else:
+        pass
+
+    cl = HClustering( distm.get_s_dm(), distm.get_f_dm(), args = args )
+    if not args.no_sclustering:
+        cl.shcluster()
+    if not args.no_fclustering:
+        cl.fhcluster()
+    
+    hmp = dm.get_numpy_matrix()
+    fnames = dm.get_fnames()
+    snames = dm.get_snames()
+    fnames_meta = snames.names[1:]
+    #if not args.no_sclustering or not args.no_fclustering ):
+    
+    hmp = cl.get_reordered_matrix( hmp, sclustering = not args.no_sclustering, fclustering = not args.no_fclustering  )
+    if not args.no_sclustering:
+        snames = cl.get_reordered_sample_labels( snames )
+    if not args.no_fclustering:
+        fnames = cl.get_reordered_feature_labels( fnames )
+    else:
+        fnames = fnames[::-1]
+
+    hm = Heatmap( hmp, cl.sdendrogram, cl.fdendrogram, snames, fnames, fnames_meta, args = args )
+    hm.draw()
+
+
+
+
+
+
Binary file hclust2/hclust2.pyc has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_metaphlan.xml	Tue Aug 26 14:51:29 2014 -0400
@@ -0,0 +1,20 @@
+<tool id="merge_metaphlan" name="merge_metaphlan" version="1.0.0">
+  <description>Merge Metaphlan</description>
+  <command interpreter="python">merge_metaphlan_tables.py
+	 #for $input in $inputs#
+		$input
+	#end for#  
+	> $out_data
+   </command>
+	<inputs>
+		  <param format="text" name="inputs" type="data" label="Metaphlan files to merge"  multiple="true"    help="Metaphlan files to merege"/>
+    </inputs>
+
+	<outputs>
+            <data  name="out_data"  format="tabular"  />
+	</outputs>
+                                  
+  <help>
+**merge metaphlan** joins one or more per-sample MetaPhlAn output tables into a single "clades vs samples" table.
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_metaphlan_tables.py	Tue Aug 26 14:51:29 2014 -0400
@@ -0,0 +1,103 @@
+#!/usr/bin/env python
+
+# ============================================================================== 
+# Merge script: from MetaPhlAn output on single sample to a joined "clades vs samples" table
+# Authors: Timothy Tickle (ttickle@hsph.harvard.edu) and Curtis Huttenhower (chuttenh@hsph.harvard.edu)
+# ==============================================================================
+
+import argparse
+import csv
+import os
+import sys
+
+
+def merge( aaastrIn, astrLabels, iCol, ostm ):
+	"""
+	Outputs the table join of the given pre-split string collection.
+	
+	:param	aaastrIn:	One or more split lines from which data are read.
+	:type	aaastrIn:	collection of collections of string collections
+	:param	astrLabels:	File names of input data.
+	:type	astrLabels:	collection of strings
+	:param	iCol:		Data column in which IDs are matched (zero-indexed).
+	:type	iCol:		int
+	:param	ostm:		Output stream to which matched rows are written.
+	:type	ostm:		output stream
+
+	"""
+	
+	setstrIDs = set()
+	"""The final set of all IDs in any table."""
+	ahashIDs = [{} for i in range( len( aaastrIn ) )]
+	"""One hash of IDs to row numbers for each input datum."""
+	aaastrData = [[] for i in range( len( aaastrIn ) )]
+	"""One data table for each input datum."""
+	aastrHeaders = [[] for i in range( len( aaastrIn ) )]
+	"""The list of non-ID headers for each input datum."""
+	strHeader = "ID"
+	"""The ID column header."""
+
+	# For each input datum in each input stream...
+	pos = 0
+
+	for f in aaastrIn :
+		with open(f) as csvfile :
+			iIn = csv.reader(csvfile, csv.excel_tab)
+
+			# Lines from the current file, empty list to hold data, empty hash to hold ids
+			aastrData, hashIDs = (a[pos] for a in (aaastrData, ahashIDs))
+
+			iLine = -1
+			# For a line in the file
+			for astrLine in iIn:
+				iLine += 1
+
+				# ID is from first column, data are everything else
+				strID, astrData = astrLine[iCol], ( astrLine[:iCol] + astrLine[( iCol + 1 ):] )
+
+				hashIDs[strID] = iLine
+				aastrData.append( astrData )
+
+			# Batch merge every new ID key set
+			setstrIDs.update( hashIDs.keys( ) )
+
+		pos += 1
+
+	# Create writer
+	csvw = csv.writer( ostm, csv.excel_tab, lineterminator='\n' )
+
+	# Make the file names the column names
+	csvw.writerow( [strHeader] + [os.path.splitext(f)[0] for f in astrLabels] )
+
+	# Write out data
+	for strID in sorted( setstrIDs ):
+		astrOut = []
+		for iIn in range( len( aaastrIn ) ):
+			aastrData, hashIDs = (a[iIn] for a in (aaastrData, ahashIDs))
+			# Look up the row number of the current ID in the current dataset, if any
+			iID = hashIDs.get( strID )
+			# If not, start with no data; if yes, pull out stored data row
+			astrData = [0.0] if ( iID is None ) else aastrData[iID]
+			# Pad output data as needed
+			astrData += [None] * ( len( aastrHeaders[iIn] ) - len( astrData ) )
+			astrOut += astrData
+		csvw.writerow( [strID] + astrOut )
+
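+# Example (hypothetical file names): merging t1.txt and t2.txt, which both
+# contain the clade 'Bacteria', yields a single row 'Bacteria<tab>v1<tab>v2';
+# a clade missing from one input contributes 0.0 in that file's column.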
+
+argp = argparse.ArgumentParser( prog = "merge_metaphlan_tables.py",
+	description = """Performs a table join on one or more metaphlan output files.""")
+argp.add_argument( "aistms",	metavar = "input.txt", nargs = "+",
+	help = "One or more tab-delimited text tables to join" )
+
+__doc__ = "::\n\n\t" + argp.format_help( ).replace( "\n", "\n\t" )
+
+argp.usage = argp.format_usage()[7:]+"\n\n\tPlease make sure to supply file paths to the files to combine. If combining 3 files (Table1.txt, Table2.txt, and Table3.txt) the call should be:\n\n\t\tpython merge_metaphlan_tables.py Table1.txt Table2.txt Table3.txt > output.txt\n\n\tA wildcard to indicate all .txt files that start with Table can be used as follows:\n\n\t\tpython merge_metaphlan_tables.py Table*.txt > output.txt"
+
+
+def _main( ):
+	args = argp.parse_args( )
+	merge(args.aistms, [os.path.basename(f) for f in args.aistms], 0, sys.stdout)
+
+
+if __name__ == "__main__":
+	_main( )