micropita: MicroPITA.py comparison

comparison MicroPITA.py @ 28:1d09ffab87a7 draft

Uploaded MicroPITA.py - fixed spaces and tabs inconsistencies

author	george-weingart
date	Tue, 22 Jun 2021 03:23:17 +0000
parents	7d25ecd225dd
children

comparison

equal deleted inserted replaced

-:d9862a9a4d84
+:1d09ffab87a7
 Author: Timothy Tickle
 Description: Class to Run analysis for the microPITA paper
 """
 #####################################################################################
-#Copyright (C) <2012>
+# Copyright (C) <2012>
 #
-#Permission is hereby granted, free of charge, to any person obtaining a copy of
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
-#this software and associated documentation files (the "Software"), to deal in the
+# this software and associated documentation files (the "Software"), to deal in the
-#Software without restriction, including without limitation the rights to use, copy,
+# Software without restriction, including without limitation the rights to use, copy,
-#modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+# modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
-#and to permit persons to whom the Software is furnished to do so, subject to
+# and to permit persons to whom the Software is furnished to do so, subject to
-#the following conditions:
+# the following conditions:
 #
-#The above copyright notice and this permission notice shall be included in all copies
+# The above copyright notice and this permission notice shall be included in all copies
-#or substantial portions of the Software.
+# or substantial portions of the Software.
 #
-#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
-#INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
-#PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-#HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-#OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-#SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #####################################################################################
+from types import *
+import scipy.spatial.distance
+import scipy.cluster.hierarchy as hcluster
+import random
+import os
+import operator
+import numpy as np
+import mlpy
+import math
+import logging
+import csv
+from src.ConstantsMicropita import ConstantsMicropita
+from src.breadcrumbs.src.UtilityMath import UtilityMath
+from src.breadcrumbs.src.SVM import SVM
+from src.breadcrumbs.src.MLPYDistanceAdaptor import MLPYDistanceAdaptor
+from src.breadcrumbs.src.KMedoids import Kmedoids
+from src.breadcrumbs.src.Metric import Metric
+from src.breadcrumbs.src.ConstantsBreadCrumbs import ConstantsBreadCrumbs
+from src.breadcrumbs.src.AbundanceTable import AbundanceTable
 __author__ = "Timothy Tickle"
 __copyright__ = "Copyright 2012"
 __credits__ = ["Timothy Tickle"]
 __license__ = "MIT"
 __maintainer__ = "Timothy Tickle"
 __email__ = "ttickle@sph.harvard.edu"
 __status__ = "Development"
 import sys
 import argparse
-from src.breadcrumbs.src.AbundanceTable import AbundanceTable
 import warnings
-warnings.simplefilter(action = "ignore", category = FutureWarning)
+warnings.simplefilter(action="ignore", category=FutureWarning)
-from src.breadcrumbs.src.ConstantsBreadCrumbs import ConstantsBreadCrumbs
-from src.breadcrumbs.src.Metric import Metric
-from src.breadcrumbs.src.KMedoids import Kmedoids
-from src.breadcrumbs.src.MLPYDistanceAdaptor import MLPYDistanceAdaptor
-from src.breadcrumbs.src.SVM import SVM
-from src.breadcrumbs.src.UtilityMath import UtilityMath
-from src.ConstantsMicropita import ConstantsMicropita
-import csv
-import logging
-import math
-import mlpy
-import numpy as np
-import operator
-import os
-import random
-import scipy.cluster.hierarchy as hcluster
-import scipy.spatial.distance
-from types import *
 class MicroPITA:
-	"""
+"""
-	Selects samples from a first tier of a multi-tiered study to be used in a second tier.
+Selects samples from a first tier of a multi-tiered study to be used in a second tier.
-	Different methods can be used for selection.
+Different methods can be used for selection.
-	The expected input is an abundance table (and potentially a text file of targeted features,
+The expected input is an abundance table (and potentially a text file of targeted features,
-	if using the targeted features option). Output is a list of samples exhibiting the
+if using the targeted features option). Output is a list of samples exhibiting the
-	characteristics of interest.
+characteristics of interest.
-	"""
+"""
-	#Constants
+# Constants
-	#Diversity metrics Alpha
+# Diversity metrics Alpha
-	c_strInverseSimpsonDiversity = Metric.c_strInvSimpsonDiversity
+c_strInverseSimpsonDiversity = Metric.c_strInvSimpsonDiversity
-	c_strChao1Diversity = Metric.c_strChao1Diversity
+c_strChao1Diversity = Metric.c_strChao1Diversity
-	#Diversity metrics Beta
+# Diversity metrics Beta
-	c_strBrayCurtisDissimilarity = Metric.c_strBrayCurtisDissimilarity
+c_strBrayCurtisDissimilarity = Metric.c_strBrayCurtisDissimilarity
-	#Additive inverses of diversity metrics beta
+# Additive inverses of diversity metrics beta
-	c_strInvBrayCurtisDissimilarity = Metric.c_strInvBrayCurtisDissimilarity
+c_strInvBrayCurtisDissimilarity = Metric.c_strInvBrayCurtisDissimilarity
-	#Technique Names
+# Technique Names
-	ConstantsMicropita.c_strDiversity2 = ConstantsMicropita.c_strDiversity+"_C"
+ConstantsMicropita.c_strDiversity2 = ConstantsMicropita.c_strDiversity+"_C"
-	#Targeted feature settings
+# Targeted feature settings
-	c_strTargetedRanked = ConstantsMicropita.c_strTargetedRanked
+c_strTargetedRanked = ConstantsMicropita.c_strTargetedRanked
-	c_strTargetedAbundance = ConstantsMicropita.c_strTargetedAbundance
+c_strTargetedAbundance = ConstantsMicropita.c_strTargetedAbundance
-	#Technique groupings
+# Technique groupings
 #	c_lsDiversityMethods = [ConstantsMicropita.c_strDiversity,ConstantsMicropita.c_strDiversity2]
-	#Converts ecology metrics into standardized method selection names
+# Converts ecology metrics into standardized method selection names
-	dictConvertAMetricDiversity = {c_strInverseSimpsonDiversity:ConstantsMicropita.c_strDiversity, c_strChao1Diversity:ConstantsMicropita.c_strDiversity2}
+dictConvertAMetricDiversity = {c_strInverseSimpsonDiversity: ConstantsMicropita.c_strDiversity,
+c_strChao1Diversity: ConstantsMicropita.c_strDiversity2}
 #	dictConvertMicroPITAToAMetric = {ConstantsMicropita.c_strDiversity:c_strInverseSimpsonDiversity, ConstantsMicropita.c_strDiversity2:c_strChao1Diversity}
-	dictConvertBMetricToMethod = {c_strBrayCurtisDissimilarity:ConstantsMicropita.c_strRepresentative}
+dictConvertBMetricToMethod = {
-	dictConvertInvBMetricToMethod = {c_strBrayCurtisDissimilarity:ConstantsMicropita.c_strExtreme}
+c_strBrayCurtisDissimilarity: ConstantsMicropita.c_strRepresentative}
+dictConvertInvBMetricToMethod = {
-	#Linkage used in the Hierarchical clustering
+c_strBrayCurtisDissimilarity: ConstantsMicropita.c_strExtreme}
-	c_strHierarchicalClusterMethod = 'average'
+# Linkage used in the Hierarchical clustering
-####Group 1## Diversity
+c_strHierarchicalClusterMethod = 'average'
-	#Testing: Happy path Testing (8)
-	def funcGetTopRankedSamples(self, lldMatrix = None, lsSampleNames = None, iTopAmount = None):
+# Group 1## Diversity
-		"""
+# Testing: Happy path Testing (8)
-		Given a list of lists of measurements, for each list the indices of the highest values are returned. If lsSamplesNames is given
+def funcGetTopRankedSamples(self, lldMatrix=None, lsSampleNames=None, iTopAmount=None):
-			it is treated as a list of string names that is in the order of the measurements in each list. Indices are returned or the sample
+"""
-			names associated with the indices.
+Given a list of lists of measurements, for each list the indices of the highest values are returned. If lsSamplesNames is given
+it is treated as a list of string names that is in the order of the measurements in each list. Indices are returned or the sample
-		:param	lldMatrix:	List of lists [[value,value,value,value],[value,value,value,value]].
+names associated with the indices.
-		:type:	List of lists	List of measurements. Each list is a different measurement. Each measurement in positionally related to a sample.
-		:param	lsSampleNames:	List of sample names positionally related (the same) to each list (Optional).
+:param	lldMatrix:	List of lists [[value,value,value,value],[value,value,value,value]].
-		:type:	List of strings	List of strings.
+:type:	List of lists	List of measurements. Each list is a different measurement. Each measurement in positionally related to a sample.
-		:param	iTopAmount:	The amount of top measured samples (assumes the higher measurements are better).
+:param	lsSampleNames:	List of sample names positionally related (the same) to each list (Optional).
-		:type:	integer	Integer amount of sample names/ indices to return.
+:type:	List of strings	List of strings.
-		:return	List:	List of samples to be selected.
+:param	iTopAmount:	The amount of top measured samples (assumes the higher measurements are better).
-		"""
+:type:	integer	Integer amount of sample names/ indices to return.
-		topRankListRet = []
+:return	List:	List of samples to be selected.
-		for rowMetrics in lldMatrix:
+"""
-			#Create 2 d array to hold value and index and sort
+topRankListRet = []
-			liIndexX = [rowMetrics,range(len(rowMetrics))]
+for rowMetrics in lldMatrix:
-			liIndexX[1].sort(key = liIndexX[0].__getitem__,reverse = True)
+# Create 2 d array to hold value and index and sort
+liIndexX = [rowMetrics, range(len(rowMetrics))]
-			if lsSampleNames:
+liIndexX[1].sort(key=liIndexX[0].__getitem__, reverse=True)
-				topRankListRet.append([lsSampleNames[iIndex] for iIndex in liIndexX[1][:iTopAmount]])
-			else:
+if lsSampleNames:
-				topRankListRet.append(liIndexX[1][:iTopAmount])
+topRankListRet.append([lsSampleNames[iIndex]
+for iIndex in liIndexX[1][:iTopAmount]])
-		return topRankListRet
+else:
+topRankListRet.append(liIndexX[1][:iTopAmount])
-	####Group 2## Representative Dissimilarity
-	#Testing: Happy path tested 1
+return topRankListRet
-	def funcGetCentralSamplesByKMedoids(self, npaMatrix=None, sMetric=None, lsSampleNames=None, iNumberSamplesReturned=0, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None):
-		"""
+# Group 2## Representative Dissimilarity
-		Gets centroid samples by k-medoids clustering of a given matrix.
+# Testing: Happy path tested 1
+def funcGetCentralSamplesByKMedoids(self, npaMatrix=None, sMetric=None, lsSampleNames=None, iNumberSamplesReturned=0, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None):
-		:param	npaMatrix:	Numpy array where row=features and columns=samples
+"""
-		:type:	Numpy array	Abundance Data.
+Gets centroid samples by k-medoids clustering of a given matrix.
-		:param	sMetric:	String name of beta metric used as the distance metric.
-		:type:	String	String name of beta metric.
+:param	npaMatrix:	Numpy array where row=features and columns=samples
-		:param	lsSampleNames:	The names of the sample
+:type:	Numpy array	Abundance Data.
-		:type:	List	List of strings
+:param	sMetric:	String name of beta metric used as the distance metric.
-		:param	iNumberSamplesReturned:	Number of samples to return, each will be a centroid of a sample.
+:type:	String	String name of beta metric.
-		:type:	Integer	Number of samples to return
+:param	lsSampleNames:	The names of the sample
-		:return	List:	List of selected samples.
+:type:	List	List of strings
-		:param	istmBetaMatrix: File with beta-diversity matrix
+:param	iNumberSamplesReturned:	Number of samples to return, each will be a centroid of a sample.
-		:type:	File stream or file path string
+:type:	Integer	Number of samples to return
-		"""
+:return	List:	List of selected samples.
+:param	istmBetaMatrix: File with beta-diversity matrix
-		#Count of how many rows
+:type:	File stream or file path string
-		sampleCount = npaMatrix.shape[0]
+"""
-		if iNumberSamplesReturned > sampleCount:
-			logging.error("MicroPITA.funcGetCentralSamplesByKMedoids:: There are not enough samples to return the amount of samples specified. Return sample count = "+str(iNumberSamplesReturned)+". Sample number = "+str(sampleCount)+".")
+# Count of how many rows
-			return False
+sampleCount = npaMatrix.shape[0]
+if iNumberSamplesReturned > sampleCount:
-		#If the cluster count is equal to the sample count return all samples
+logging.error("MicroPITA.funcGetCentralSamplesByKMedoids:: There are not enough samples to return the amount of samples specified. Return sample count = " +
-		if sampleCount == iNumberSamplesReturned:
+str(iNumberSamplesReturned)+". Sample number = "+str(sampleCount)+".")
-			return list(lsSampleNames)
+return False
-		#Get distance matrix
+# If the cluster count is equal to the sample count return all samples
-		distanceMatrix=scipy.spatial.distance.squareform(Metric.funcReadMatrixFile(istmMatrixFile=istmBetaMatrix,lsSampleOrder=lsSampleNames)[0]) if istmBetaMatrix else Metric.funcGetBetaMetric(npadAbundancies=npaMatrix, sMetric=sMetric, istrmTree=istrmTree, istrmEnvr=istrmEnvr, lsSampleOrder=lsSampleNames)
+if sampleCount == iNumberSamplesReturned:
-		if type(distanceMatrix) is BooleanType:
+return list(lsSampleNames)
-			logging.error("MicroPITA.funcGetCentralSamplesByKMedoids:: Could not read in the supplied distance matrix, returning false.")
-			return False
+# Get distance matrix
+distanceMatrix = scipy.spatial.distance.squareform(Metric.funcReadMatrixFile(istmMatrixFile=istmBetaMatrix, lsSampleOrder=lsSampleNames)[
-		# Handle unifrac output
+0]) if istmBetaMatrix else Metric.funcGetBetaMetric(npadAbundancies=npaMatrix, sMetric=sMetric, istrmTree=istrmTree, istrmEnvr=istrmEnvr, lsSampleOrder=lsSampleNames)
-		if sMetric in [Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted]:
+if type(distanceMatrix) is BooleanType:
-			distanceMatrix = distanceMatrix[0]
+logging.error(
+"MicroPITA.funcGetCentralSamplesByKMedoids:: Could not read in the supplied distance matrix, returning false.")
-		#Log distance matrix
+return False
-		logging.debug("MicroPITA.funcGetCentralSamplesByKMedoids:: Distance matrix for representative selection using metric="+str(sMetric))
+# Handle unifrac output
-		distance = MLPYDistanceAdaptor(npaDistanceMatrix=distanceMatrix, fIsCondensedMatrix=True)
+if sMetric in [Metric.c_strUnifracUnweighted, Metric.c_strUnifracWeighted]:
+distanceMatrix = distanceMatrix[0]
-		#Create object to determine clusters/medoids
-		medoidsMaker = Kmedoids(k=iNumberSamplesReturned, dist=distance)
+# Log distance matrix
-		#medoidsData includes(1d numpy array, medoids indexes;
+logging.debug(
-		#			  1d numpy array, non-medoids indexes;
+"MicroPITA.funcGetCentralSamplesByKMedoids:: Distance matrix for representative selection using metric="+str(sMetric))
-		#			  1d numpy array, cluster membership for non-medoids;
-		#			  double, cost of configuration)
+distance = MLPYDistanceAdaptor(
-		#npaMatrix is samples x rows
+npaDistanceMatrix=distanceMatrix, fIsCondensedMatrix=True)
-		#Build a matrix of lists of indicies to pass to the distance matrix
-		lliIndicesMatrix = [[iIndexPosition] for iIndexPosition in xrange(0,len(npaMatrix))]
+# Create object to determine clusters/medoids
-		medoidsData = medoidsMaker.compute(np.array(lliIndicesMatrix))
+medoidsMaker = Kmedoids(k=iNumberSamplesReturned, dist=distance)
-		logging.debug("MicroPITA.funcGetCentralSamplesByKMedoids:: Results from the kmedoid method in representative selection:")
+# medoidsData includes(1d numpy array, medoids indexes;
-		logging.debug(str(medoidsData))
+#			  1d numpy array, non-medoids indexes;
+#			  1d numpy array, cluster membership for non-medoids;
-		#If returning the same amount of clusters and samples
+#			  double, cost of configuration)
-		#Return centroids
+# npaMatrix is samples x rows
-		selectedIndexes = medoidsData[0]
+# Build a matrix of lists of indicies to pass to the distance matrix
-		return [lsSampleNames[selectedIndexes[index]] for index in xrange(0,iNumberSamplesReturned)]
+lliIndicesMatrix = [[iIndexPosition]
+for iIndexPosition in xrange(0, len(npaMatrix))]
-	####Group 3## Highest Dissimilarity
+medoidsData = medoidsMaker.compute(np.array(lliIndicesMatrix))
-	#Testing: Happy path tested
+logging.debug(
-	def funcSelectExtremeSamplesFromHClust(self, strBetaMetric, npaAbundanceMatrix, lsSampleNames, iSelectSampleCount, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None):
+"MicroPITA.funcGetCentralSamplesByKMedoids:: Results from the kmedoid method in representative selection:")
-		"""
+logging.debug(str(medoidsData))
-		Select extreme samples from HClustering.
+# If returning the same amount of clusters and samples
-		:param	strBetaMetric:	The beta metric to use for distance matrix generation.
+# Return centroids
-		:type:	String	The name of the beta metric to use.
+selectedIndexes = medoidsData[0]
-		:param	npaAbundanceMatrix:	Numpy array where row=samples and columns=features.
+return [lsSampleNames[selectedIndexes[index]] for index in xrange(0, iNumberSamplesReturned)]
-		:type:	Numpy Array	Abundance data.
-		:param	lsSampleNames:	The names of the sample.
+# Group 3## Highest Dissimilarity
-		:type:	List	List of strings.
+# Testing: Happy path tested
-		:param	iSelectSampleCount:	Number of samples to select (return).
+def funcSelectExtremeSamplesFromHClust(self, strBetaMetric, npaAbundanceMatrix, lsSampleNames, iSelectSampleCount, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None):
-		:type:	Integer	Integer number of samples returned.
+"""
-		:return	Samples:	List of samples.
+Select extreme samples from HClustering.
-		:param	istmBetaMatrix: File with beta-diversity matrix
-		:type:	File stream or file path string
+:param	strBetaMetric:	The beta metric to use for distance matrix generation.
-		"""
+:type:	String	The name of the beta metric to use.
+:param	npaAbundanceMatrix:	Numpy array where row=samples and columns=features.
-		#If they want all the sample count, return all sample names
+:type:	Numpy Array	Abundance data.
-		iSampleCount=len(npaAbundanceMatrix[:,0])
+:param	lsSampleNames:	The names of the sample.
-		if iSelectSampleCount==iSampleCount:
+:type:	List	List of strings.
-		  return lsSampleNames
+:param	iSelectSampleCount:	Number of samples to select (return).
+:type:	Integer	Integer number of samples returned.
-		#Holds the samples to be returned
+:return	Samples:	List of samples.
-		lsReturnSamplesRet = []
+:param	istmBetaMatrix: File with beta-diversity matrix
+:type:	File stream or file path string
-		#Generate beta matrix
+"""
-		#Returns condensed matrix
-		tempDistanceMatrix = scipy.spatial.distance.squareform(Metric.funcReadMatrixFile(istmMatrixFile=istmBetaMatrix,lsSampleOrder=lsSampleNames)[0]) if istmBetaMatrix else Metric.funcGetBetaMetric(npadAbundancies=npaAbundanceMatrix, sMetric=strBetaMetric, istrmTree=istrmTree, istrmEnvr=istrmEnvr, lsSampleOrder=lsSampleNames, fAdditiveInverse = True)
+# If they want all the sample count, return all sample names
+iSampleCount = len(npaAbundanceMatrix[:, 0])
-		if strBetaMetric in [Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted]:
+if iSelectSampleCount == iSampleCount:
-			tempDistanceMatrix = tempDistanceMatrix[0]
+return lsSampleNames
-		if type(tempDistanceMatrix) is BooleanType:
+# Holds the samples to be returned
-			logging.error("MicroPITA.funcSelectExtremeSamplesFromHClust:: Could not read in the supplied distance matrix, returning false.")
+lsReturnSamplesRet = []
-			return False
+# Generate beta matrix
-		if istmBetaMatrix:
+# Returns condensed matrix
-			tempDistanceMatrix = 1-tempDistanceMatrix
+tempDistanceMatrix = scipy.spatial.distance.squareform(Metric.funcReadMatrixFile(istmMatrixFile=istmBetaMatrix, lsSampleOrder=lsSampleNames)[0]) if istmBetaMatrix else Metric.funcGetBetaMetric(
+npadAbundancies=npaAbundanceMatrix, sMetric=strBetaMetric, istrmTree=istrmTree, istrmEnvr=istrmEnvr, lsSampleOrder=lsSampleNames, fAdditiveInverse=True)
-		#Feed beta matrix to linkage to cluster
-		#Send condensed matrix
+if strBetaMetric in [Metric.c_strUnifracUnweighted, Metric.c_strUnifracWeighted]:
-		linkageMatrix = hcluster.linkage(tempDistanceMatrix, method=self.c_strHierarchicalClusterMethod)
+tempDistanceMatrix = tempDistanceMatrix[0]
-		#Extract cluster information from dendrogram
+if type(tempDistanceMatrix) is BooleanType:
-		#The linakge matrix is of the form
+logging.error(
-		#[[int1 int2 doube int3],...]
+"MicroPITA.funcSelectExtremeSamplesFromHClust:: Could not read in the supplied distance matrix, returning false.")
-		#int1 and int1 are the paired samples indexed at 0 and up.
+return False
-		#each list is an entry for a branch that is number starting with the first
-		#list being sample count index + 1
+if istmBetaMatrix:
-		#each list is then named by an increment as they appear
+tempDistanceMatrix = 1-tempDistanceMatrix
-		#this means that if a number is in the list and is = sample count or greater it is not
-		#terminal and is instead a branch.
+# Feed beta matrix to linkage to cluster
-		#This method just takes the lowest metric measurement (highest distance pairs/clusters)
+# Send condensed matrix
-		#Works much better than the original technique
+linkageMatrix = hcluster.linkage(
-		#get total number of samples
+tempDistanceMatrix, method=self.c_strHierarchicalClusterMethod)
-		iCurrentSelectCount = 0
+# Extract cluster information from dendrogram
-		for row in linkageMatrix:
+# The linakge matrix is of the form
-			#Get nodes ofthe lowest pairing (so the furthest apart pair)
+# [[int1 int2 doube int3],...]
-			iNode1 = int(row[0])
+# int1 and int1 are the paired samples indexed at 0 and up.
-			iNode2 = int(row[1])
+# each list is an entry for a branch that is number starting with the first
-			#Make sure the nodes are a terminal node (sample) and not a branch in the dendrogram
+# list being sample count index + 1
-			#The branching in the dendrogram will start at the number of samples and increment higher.
+# each list is then named by an increment as they appear
-			#Add each of the pair one at a time breaking when enough samples are selected.
+# this means that if a number is in the list and is = sample count or greater it is not
-			if iNode1<iSampleCount:
+# terminal and is instead a branch.
-				lsReturnSamplesRet.append(lsSampleNames[iNode1])
+# This method just takes the lowest metric measurement (highest distance pairs/clusters)
-				iCurrentSelectCount = iCurrentSelectCount + 1
+# Works much better than the original technique
-			if iCurrentSelectCount == iSelectSampleCount:
+# get total number of samples
-				break
-			if iNode2<iSampleCount:
+iCurrentSelectCount = 0
-				lsReturnSamplesRet.append(lsSampleNames[iNode2])
+for row in linkageMatrix:
-				iCurrentSelectCount = iCurrentSelectCount + 1
+# Get nodes ofthe lowest pairing (so the furthest apart pair)
-			if iCurrentSelectCount == iSelectSampleCount:
+iNode1 = int(row[0])
-				break
+iNode2 = int(row[1])
+# Make sure the nodes are a terminal node (sample) and not a branch in the dendrogram
-		#Return selected samples
+# The branching in the dendrogram will start at the number of samples and increment higher.
-		return lsReturnSamplesRet
+# Add each of the pair one at a time breaking when enough samples are selected.
+if iNode1 < iSampleCount:
-	####Group 4## Rank Average of user Defined Taxa
+lsReturnSamplesRet.append(lsSampleNames[iNode1])
-		#Testing: Happy Path Tested
+iCurrentSelectCount = iCurrentSelectCount + 1
-	def funcGetAverageAbundanceSamples(self, abndTable, lsTargetedFeature, fRank=False):
+if iCurrentSelectCount == iSelectSampleCount:
-		"""
+break
-		Averages feature abundance or ranked abundance. Expects a column 0 of taxa id that is skipped.
+if iNode2 < iSampleCount:
+lsReturnSamplesRet.append(lsSampleNames[iNode2])
-		:param	abndTable:	Abundance Table to analyse
+iCurrentSelectCount = iCurrentSelectCount + 1
-		:type:	AbundanceTable	Abundance Table
+if iCurrentSelectCount == iSelectSampleCount:
-		:param	lsTargetedFeature:	String names
+break
-		:type:	list	list of string names of features (bugs) which are measured after ranking against the full sample
-		:param  fRank:	Indicates to rank the abundance before getting the average abundance of the features (default false)
+# Return selected samples
-		:type:   boolean	Flag indicating ranking abundance before calculating average feature measurement (false= no ranking)
+return lsReturnSamplesRet
-		:return	List of lists or boolean:	List of lists or False on error. One internal list per sample indicating the sample,
-				feature average abundance or ranked abundance. Lists will already be sorted.
+# Group 4## Rank Average of user Defined Taxa
-				For not Ranked [[sample,average abundance of selected feature,1]]
+# Testing: Happy Path Tested
-				For Ranked [[sample,average ranked abundance, average abundance of selected feature]]
+def funcGetAverageAbundanceSamples(self, abndTable, lsTargetedFeature, fRank=False):
-				Error Returns false
+"""
-		"""
+Averages feature abundance or ranked abundance. Expects a column 0 of taxa id that is skipped.
-		llAbundance = abndTable.funcGetAverageAbundancePerSample(lsTargetedFeature)
+:param	abndTable:	Abundance Table to analyse
-		if not llAbundance:
+:type:	AbundanceTable	Abundance Table
-			logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not get average abundance, returned false. Make sure the features (bugs) are spelled correctly and in the abundance table.")
+:param	lsTargetedFeature:	String names
-			return False
+:type:	list	list of string names of features (bugs) which are measured after ranking against the full sample
-		#Add a space for ranking if needed
+:param  fRank:	Indicates to rank the abundance before getting the average abundance of the features (default false)
-		#Not ranked will be [[sSample,average abundance,1]]
+:type:   boolean	Flag indicating ranking abundance before calculating average feature measurement (false= no ranking)
-		#(where 1 will not discriminant ties if used in later functions, so this generalizes)
+:return	List of lists or boolean:	List of lists or False on error. One internal list per sample indicating the sample,
-		#Ranked will be [[sSample, average rank, average abundance]]
+feature average abundance or ranked abundance. Lists will already be sorted.
-		llRetAbundance = [[llist[0],-1,llist[1]] for llist in llAbundance]
+For not Ranked [[sample,average abundance of selected feature,1]]
-		#Rank if needed
+For Ranked [[sample,average ranked abundance, average abundance of selected feature]]
-		if fRank:
+Error Returns false
-			abndRanked = abndTable.funcRankAbundance()
+"""
-			if abndRanked == None:
-				logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not rank the abundance table, returned false.")
+llAbundance = abndTable.funcGetAverageAbundancePerSample(
-				return False
+lsTargetedFeature)
-			llRetRank = abndRanked.funcGetAverageAbundancePerSample(lsTargetedFeature)
+if not llAbundance:
-			if not llRetRank:
+logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not get average abundance, returned false. Make sure the features (bugs) are spelled correctly and in the abundance table.")
-				logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not get average ranked abundance, returned false. Make sure the features (bugs) are spelled correctly and in the abundance table.")
+return False
-				return False
+# Add a space for ranking if needed
-			dictRanks = dict(llRetRank)
+# Not ranked will be [[sSample,average abundance,1]]
-			llRetAbundance = [[a[0],dictRanks[a[0]],a[2]] for a in llRetAbundance]
+# (where 1 will not discriminant ties if used in later functions, so this generalizes)
+# Ranked will be [[sSample, average rank, average abundance]]
-		#Sort first for ties and then for the main feature
+llRetAbundance = [[llist[0], -1, llist[1]] for llist in llAbundance]
-		if not fRank or ConstantsMicropita.c_fBreakRankTiesByDiversity:
+# Rank if needed
-			llRetAbundance = sorted(llRetAbundance, key = lambda sampleData: sampleData[2], reverse = not fRank)
+if fRank:
-		if fRank:
+abndRanked = abndTable.funcRankAbundance()
-			llRetAbundance = sorted(llRetAbundance, key = lambda sampleData: sampleData[1], reverse = not fRank)
+if abndRanked == None:
-		return llRetAbundance
+logging.error(
+"MicroPITA.funcGetAverageAbundanceSamples:: Could not rank the abundance table, returned false.")
-	#Testing: Happy Path Tested
+return False
-	def funcSelectTargetedTaxaSamples(self, abndMatrix, lsTargetedTaxa, iSampleSelectionCount, sMethod = ConstantsMicropita.lsTargetedFeatureMethodValues[0]):
+llRetRank = abndRanked.funcGetAverageAbundancePerSample(
-	  """
+lsTargetedFeature)
-	  Selects samples with the highest ranks or abundance of targeted features.
+if not llRetRank:
-	  If ranked, select the highest abundance for tie breaking
+logging.error(
+"MicroPITA.funcGetAverageAbundanceSamples:: Could not get average ranked abundance, returned false. Make sure the features (bugs) are spelled correctly and in the abundance table.")
-	  :param	abndMatrix:	Abundance table to analyse
+return False
-	  :type:	AbundanceTable	Abundance table
+dictRanks = dict(llRetRank)
-	  :param	lsTargetedTaxa:	List of features
+llRetAbundance = [[a[0], dictRanks[a[0]], a[2]]
-	  :type:	list	list of strings
+for a in llRetAbundance]
-	  :param	iSampleSelectionCount:	Number of samples to select
-	  :type:	integer	integer
+# Sort first for ties and then for the main feature
-	  :param	sMethod:	Method to select targeted features
+if not fRank or ConstantsMicropita.c_fBreakRankTiesByDiversity:
-	  :type:	string	String (Can be values found in ConstantsMicropita.lsTargetedFeatureMethodValues)
+llRetAbundance = sorted(
-	  :return	List of strings:	List of sample names which were selected
+llRetAbundance, key=lambda sampleData: sampleData[2], reverse=not fRank)
-	  List of strings	Empty list is returned on an error.
+if fRank:
-	  """
+llRetAbundance = sorted(
+llRetAbundance, key=lambda sampleData: sampleData[1], reverse=not fRank)
-	  #Check data
+return llRetAbundance
-	  if(len(lsTargetedTaxa) < 1):
-		logging.error("MicroPITA.funcSelectTargetedTaxaSamples. Taxa defined selection was requested but no features were given.")
+# Testing: Happy Path Tested
-		return []
+def funcSelectTargetedTaxaSamples(self, abndMatrix, lsTargetedTaxa, iSampleSelectionCount, sMethod=ConstantsMicropita.lsTargetedFeatureMethodValues[0]):
+"""
-	  lsTargetedSamples = self.funcGetAverageAbundanceSamples(abndTable=abndMatrix, lsTargetedFeature=lsTargetedTaxa,
+Selects samples with the highest ranks or abundance of targeted features.
-	  	fRank=sMethod.lower() == self.c_strTargetedRanked.lower())
+If ranked, select the highest abundance for tie breaking
-	  #If an error occured or the key word for the method was not recognized
-	  if lsTargetedSamples == False:
+:param	abndMatrix:	Abundance table to analyse
-		  logging.error("MicroPITA.funcSelectTargetedTaxaSamples:: Was not able to select for the features given. So targeted feature selection was performed. Check to make sure the features are spelled correctly and exist in the abundance file.")
+:type:	AbundanceTable	Abundance table
-		  return []
+:param	lsTargetedTaxa:	List of features
+:type:	list	list of strings
-	  #Select from results
+:param	iSampleSelectionCount:	Number of samples to select
-	  return [sSample[0] for sSample in lsTargetedSamples[:iSampleSelectionCount]]
+:type:	integer	integer
+:param	sMethod:	Method to select targeted features
-	####Group 5## Random
+:type:	string	String (Can be values found in ConstantsMicropita.lsTargetedFeatureMethodValues)
-	#Testing: Happy path Tested
+:return	List of strings:	List of sample names which were selected
-	def funcGetRandomSamples(self, lsSamples=None, iNumberOfSamplesToReturn=0):
+List of strings	Empty list is returned on an error.
-		"""
+"""
-		Returns random sample names of the number given. No replacement.
+# Check data
-		:param	lsSamples:	List of sample names
+if(len(lsTargetedTaxa) < 1):
-		:type:	list	list of strings
+logging.error(
-		:param	iNumberOfSamplesToReturn:	Number of samples to select
+"MicroPITA.funcSelectTargetedTaxaSamples. Taxa defined selection was requested but no features were given.")
-		:type:	integer	integer.
+return []
-		:return	List:	List of selected samples (strings).
-		"""
+lsTargetedSamples = self.funcGetAverageAbundanceSamples(abndTable=abndMatrix, lsTargetedFeature=lsTargetedTaxa,
+fRank=sMethod.lower() == self.c_strTargetedRanked.lower())
-		#Input matrix sample count
+# If an error occurred or the key word for the method was not recognized
-		sampleCount = len(lsSamples)
+if lsTargetedSamples == False:
+logging.error("MicroPITA.funcSelectTargetedTaxaSamples:: Was not able to select for the features given. So targeted feature selection was performed. Check to make sure the features are spelled correctly and exist in the abundance file.")
-		#Return the full matrix if they ask for a return matrix where length == original
+return []
-		if(iNumberOfSamplesToReturn >= sampleCount):
-			return lsSamples
+# Select from results
+return [sSample[0] for sSample in lsTargetedSamples[:iSampleSelectionCount]]
-		#Get the random indices for the sample (without replacement)
-		liRandomIndices = random.sample(range(sampleCount), iNumberOfSamplesToReturn)
+# Group 5## Random
+# Testing: Happy path Tested
-		#Create a boolean array of if indexes are to be included in the reduced array
+def funcGetRandomSamples(self, lsSamples=None, iNumberOfSamplesToReturn=0):
-return [sSample for iIndex, sSample in enumerate(lsSamples) if iIndex in liRandomIndices]
+"""
+Returns random sample names of the number given. No replacement.
-	#Happy path tested (case 3)
-	def funcGetAveragePopulation(self, abndTable, lfCompress):
+:param	lsSamples:	List of sample names
-		"""
+:type:	list	list of strings
-		Get the average row per column in the abndtable.
+:param	iNumberOfSamplesToReturn:	Number of samples to select
+:type:	integer	integer.
-		:param abndTable: AbundanceTable of data to be averaged
+:return	List:	List of selected samples (strings).
-		:type: AbudanceTable
+"""
-		:param lfCompress: List of boolean flags (false means to remove sample before averaging
-		:type: List of floats
+# Input matrix sample count
-		:return List of doubles:
+sampleCount = len(lsSamples)
-		"""
-		if sum(lfCompress) == 0:
+# Return the full matrix if they ask for a return matrix where length == original
-			return []
+if(iNumberOfSamplesToReturn >= sampleCount):
+return lsSamples
-		#Get the average populations
-		lAverageRet = []
+# Get the random indices for the sample (without replacement)
+liRandomIndices = random.sample(
-		for sFeature in abndTable.funcGetAbundanceCopy():
+range(sampleCount), iNumberOfSamplesToReturn)
-			sFeature = list(sFeature)[1:]
-			sFeature=np.compress(lfCompress,sFeature,axis=0)
+# Create a boolean array of if indexes are to be included in the reduced array
-			lAverageRet.append(sum(sFeature)/float(len(sFeature)))
+return [sSample for iIndex, sSample in enumerate(lsSamples) if iIndex in liRandomIndices]
-		return lAverageRet
+# Happy path tested (case 3)
-	#Happy path tested (2 cases)
+def funcGetAveragePopulation(self, abndTable, lfCompress):
-	def funcGetDistanceFromAverage(self, abndTable,ldAverage,lsSamples,lfSelected):
+"""
-		"""
+Get the average row per column in the abndtable.
-		Given an abundance table and an average sample, this returns the distance of each sample
-		(measured using brays-curtis dissimilarity) from the average.
+:param abndTable: AbundanceTable of data to be averaged
-		The distances are reduced by needing to be in the lsSamples and being a true in the lfSelected
+:type: AbundanceTable
-		(which is associated with the samples in the order of the samples in the abundance table;
+:param lfCompress: List of boolean flags (false means to remove sample before averaging)
-		use abundancetable.funcGetSampleNames() to see the order if needed).
+:type: List of floats
+:return List of doubles:
-		:param abndTable: Abundance table holding the data to be analyzed.
+"""
-		:type: AbundanceTable
+if sum(lfCompress) == 0:
-		:param ldAverage: Average population (Average features of the abundance table of samples)
+return []
-		:type: List of doubles which represent the average population
-		:param lsSamples: These are the only samples used in the analysis
+# Get the average populations
-		:type: List of strings (sample ids)
+lAverageRet = []
-		:param lfSelected: Samples to be included in the analysis
-		:type: List of boolean (true means include)
+for sFeature in abndTable.funcGetAbundanceCopy():
-		:return: List of distances (doubles)
+sFeature = list(sFeature)[1:]
-		"""
+sFeature = np.compress(lfCompress, sFeature, axis=0)
-		#Get the distance from label 1 of all samples in label0 splitting into selected and not selected lists
+lAverageRet.append(sum(sFeature)/float(len(sFeature)))
-		ldSelectedDistances = []
+return lAverageRet
-		for sSampleName in [sSample for iindex, sSample in enumerate(lsSamples) if lfSelected[iindex]]:
+# Happy path tested (2 cases)
-			#Get the sample measurements
+def funcGetDistanceFromAverage(self, abndTable, ldAverage, lsSamples, lfSelected):
-			ldSelectedDistances.append(Metric.funcGetBrayCurtisDissimilarity(np.array([abndTable.funcGetSample(sSampleName),ldAverage]))[0])
+"""
-		return ldSelectedDistances
+Given an abundance table and an average sample, this returns the distance of each sample
+(measured using Bray-Curtis dissimilarity) from the average.
-	#Happy path tested (1 case)
+The distances are reduced by needing to be in the lsSamples and being a true in the lfSelected
-	def funcMeasureDistanceFromLabelToAverageOtherLabel(self, abndTable, lfGroupOfInterest, lfGroupOther):
+(which is associated with the samples in the order of the samples in the abundance table;
-		"""
+use abundancetable.funcGetSampleNames() to see the order if needed).
-		Get the distance of samples from one label from the average sample of not the label.
-		Note: This assumes 2 classes.
+:param abndTable: Abundance table holding the data to be analyzed.
+:type: AbundanceTable
-		:param abndTable: Table of data to work out of.
+:param ldAverage: Average population (Average features of the abundance table of samples)
-		:type: Abundace Table
+:type: List of doubles which represent the average population
-		:param lfGroupOfInterest: Boolean indicator of the sample being in the first group.
+:param lsSamples: These are the only samples used in the analysis
-		:type: List of floats, true indicating an individual in the group of interest.
+:type: List of strings (sample ids)
-		:param lfGroupOther:	Boolean indicator of the sample being in the other group.
+:param lfSelected: Samples to be included in the analysis
-		:type:	List of floats, true indicating an individual in the
+:type: List of boolean (true means include)
-		:return List of List of doubles: [list of tuples (string sample name,double distance) for the selected population, list of tuples for the not selected population]
+:return: List of distances (doubles)
-		"""
+"""
-		#Get all sample names
+# Get the distance from label 1 of all samples in label0 splitting into selected and not selected lists
-		lsAllSamples = abndTable.funcGetSampleNames()
+ldSelectedDistances = []
-		#Get average populations
+for sSampleName in [sSample for iindex, sSample in enumerate(lsSamples) if lfSelected[iindex]]:
-		lAverageOther = self.funcGetAveragePopulation(abndTable=abndTable, lfCompress=lfGroupOther)
+# Get the sample measurements
+ldSelectedDistances.append(Metric.funcGetBrayCurtisDissimilarity(
-		#Get the distance from the average of the other label (label 1)
+np.array([abndTable.funcGetSample(sSampleName), ldAverage]))[0])
-		ldSelectedDistances = self.funcGetDistanceFromAverage(abndTable=abndTable, ldAverage=lAverageOther,
+return ldSelectedDistances
-			lsSamples=lsAllSamples, lfSelected=lfGroupOfInterest)
+# Happy path tested (1 case)
-		return zip([lsAllSamples[iindex] for iindex, fGroup in enumerate(lfGroupOfInterest) if fGroup],ldSelectedDistances)
+def funcMeasureDistanceFromLabelToAverageOtherLabel(self, abndTable, lfGroupOfInterest, lfGroupOther):
+"""
-	#Happy path tested (1 test case)
+Get the distance of samples from one label from the average sample of not the label.
-	def funcPerformDistanceSelection(self, abndTable, iSelectionCount, sLabel, sValueOfInterest):
+Note: This assumes 2 classes.
-		"""
-		Given metadata, metadata of one value (sValueOfInterest) is measured from the average (centroid) value of another label group.
+:param abndTable: Table of data to work out of.
-		An iSelectionCount of samples is selected from the group of interest closest to and furthest from the centroid of the other group.
+:type: AbundanceTable
+:param lfGroupOfInterest: Boolean indicator of the sample being in the first group.
-		:params  abndTable: Abundance of measurements
+:type: List of floats, true indicating an individual in the group of interest.
-		:type: AbundanceTable
+:param lfGroupOther:	Boolean indicator of the sample being in the other group.
-		:params iSelectionCount: The number of samples selected per sample.
+:type:	List of floats, true indicating an individual in the other group.
-		:type: Integer Integer greater than 0
+:return List of List of doubles: [list of tuples (string sample name,double distance) for the selected population, list of tuples for the not selected population]
-		:params sLabel: ID of the metadata which is the supervised label
+"""
-		:type: String
+# Get all sample names
-		:params sValueOfInterest: Metadata value in the sLabel metadta row of the abundance table which defines the group of interest.
+lsAllSamples = abndTable.funcGetSampleNames()
-		:type: String found in the abundance table metadata row indicated by sLabel.
-		:return list list of tuples (samplename, distance) [[iSelectionCount of tuples closest to the other centroid], [iSelectionCount of tuples farthest from the other centroid], [all tuples of samples not selected]]
+# Get average populations
-		"""
+lAverageOther = self.funcGetAveragePopulation(
+abndTable=abndTable, lfCompress=lfGroupOther)
-		lsMetadata = abndTable.funcGetMetadata(sLabel)
-		#Other metadata values
+# Get the distance from the average of the other label (label 1)
-		lsUniqueOtherValues = list(set(lsMetadata)-set(sValueOfInterest))
+ldSelectedDistances = self.funcGetDistanceFromAverage(abndTable=abndTable, ldAverage=lAverageOther,
+lsSamples=lsAllSamples, lfSelected=lfGroupOfInterest)
-		#Get boolean indicator of values of interest
-		lfLabelsInterested = [sValueOfInterest == sValue for sValue in lsMetadata]
+return zip([lsAllSamples[iindex] for iindex, fGroup in enumerate(lfGroupOfInterest) if fGroup], ldSelectedDistances)
-#Get the distances of the items of interest from the other metadata values
+# Happy path tested (1 test case)
-		dictDistanceAverages = {}
+def funcPerformDistanceSelection(self, abndTable, iSelectionCount, sLabel, sValueOfInterest):
-for sOtherLabel in lsUniqueOtherValues:
+"""
-			#Get boolean indicator of labels not of interest
+Given metadata, metadata of one value (sValueOfInterest) is measured from the average (centroid) value of another label group.
-			lfLabelsOther = [sOtherLabel == sValue for sValue in lsMetadata]
+An iSelectionCount of samples is selected from the group of interest closest to and furthest from the centroid of the other group.
-			#Get the distances of data from two different groups to the average of the other
+:params  abndTable: Abundance of measurements
-			ldValueDistances = dict(self.funcMeasureDistanceFromLabelToAverageOtherLabel(abndTable, lfLabelsInterested, lfLabelsOther))
+:type: AbundanceTable
+:params iSelectionCount: The number of samples selected per sample.
-			for sKey in ldValueDistances:
+:type: Integer Integer greater than 0
-				dictDistanceAverages[sKey] = ldValueDistances[sKey] + dictDistanceAverages[sKey] if sKey in dictDistanceAverages else ldValueDistances[sKey]
+:params sLabel: ID of the metadata which is the supervised label
+:type: String
-		#Finish average by dividing by length of lsUniqueOtherValues
+:params sValueOfInterest: Metadata value in the sLabel metadata row of the abundance table which defines the group of interest.
-		ltpleAverageDistances = [(sKey, dictDistanceAverages[sKey]/float(len(lsUniqueOtherValues))) for sKey in dictDistanceAverages]
+:type: String found in the abundance table metadata row indicated by sLabel.
+:return list list of tuples (samplename, distance) [[iSelectionCount of tuples closest to the other centroid], [iSelectionCount of tuples farthest from the other centroid], [all tuples of samples not selected]]
-#Sort to extract extremes
+"""
-ltpleAverageDistances = sorted(ltpleAverageDistances,key=operator.itemgetter(1))
+lsMetadata = abndTable.funcGetMetadata(sLabel)
-		#Get the closest and farthest distances
+# Other metadata values
-		ltupleDiscriminantSamples = ltpleAverageDistances[:iSelectionCount]
+lsUniqueOtherValues = list(set(lsMetadata)-set(sValueOfInterest))
-		ltupleDistinctSamples = ltpleAverageDistances[iSelectionCount*-1:]
+# Get boolean indicator of values of interest
-		#Remove the selected samples from the larger population of distances (better visualization)
+lfLabelsInterested = [sValueOfInterest ==
-		ldSelected = [tpleSelected[0] for tpleSelected in ltupleDiscriminantSamples+ltupleDistinctSamples]
+sValue for sValue in lsMetadata]
-		#Return discriminant tuples, distinct tuples, other tuples
+# Get the distances of the items of interest from the other metadata values
-		return [ltupleDiscriminantSamples, ltupleDistinctSamples,
+dictDistanceAverages = {}
-			   [tplData for tplData in ltpleAverageDistances if tplData[0] not in ldSelected]]
+for sOtherLabel in lsUniqueOtherValues:
+# Get boolean indicator of labels not of interest
-	#Run the supervised method surrounding distance from centroids
+lfLabelsOther = [sOtherLabel == sValue for sValue in lsMetadata]
-	#Happy path tested (3 test cases)
-	def funcRunSupervisedDistancesFromCentroids(self, abundanceTable, fRunDistinct, fRunDiscriminant,
+# Get the distances of data from two different groups to the average of the other
-						xOutputSupFile, xPredictSupFile, strSupervisedMetadata,
+ldValueDistances = dict(self.funcMeasureDistanceFromLabelToAverageOtherLabel(
-						iSampleSupSelectionCount, lsOriginalSampleNames, lsOriginalLabels, fAppendFiles = False):
+abndTable, lfLabelsInterested, lfLabelsOther))
-		"""
-		Runs supervised methods based on measuring distances of one label from the centroid of another. NAs are evaluated as theirown group.
+for sKey in ldValueDistances:
+dictDistanceAverages[sKey] = ldValueDistances[sKey] + \
-		:param	abundanceTable:	AbundanceTable
+dictDistanceAverages[sKey] if sKey in dictDistanceAverages else ldValueDistances[sKey]
-		:type:	AbudanceTable	Data to analyze
-		:param	fRunDistinct:	Run distinct selection method
+# Finish average by dividing by length of lsUniqueOtherValues
-		:type:	Boolean	boolean (true runs method)
+ltpleAverageDistances = [(sKey, dictDistanceAverages[sKey]/float(
-		:param	fRunDiscriminant:	Run discriminant method
+len(lsUniqueOtherValues))) for sKey in dictDistanceAverages]
-		:type:	Boolean	boolean (true runs method)
-		:param	xOutputSupFile:	File output from supervised methods detailing data going into the method.
+# Sort to extract extremes
-		:type:	String or FileStream
+ltpleAverageDistances = sorted(
-		:param	xPredictSupFile:	File output from supervised methods distance results from supervised methods.
+ltpleAverageDistances, key=operator.itemgetter(1))
-		:type:	String or FileStream
-		:param strSupervisedMetadata:	The metadata that will be used to group samples.
+# Get the closest and farthest distances
-		:type:	String
+ltupleDiscriminantSamples = ltpleAverageDistances[:iSelectionCount]
-		:param	iSampleSupSelectionCount:	Number of samples to select
+ltupleDistinctSamples = ltpleAverageDistances[iSelectionCount*-1:]
-		:type:	Integer	int sample selection count
-		:param lsOriginalSampleNames:	List of the sample names, order is important and should be preserved from the abundanceTable.
+# Remove the selected samples from the larger population of distances (better visualization)
-		:type:	List of samples
+ldSelected = [tpleSelected[0]
-		:param	fAppendFiles:	Indicates that output files already exist and appending is occuring.
+for tpleSelected in ltupleDiscriminantSamples+ltupleDistinctSamples]
-		:type:	Boolean
-		:return	Selected Samples:	A dictionary of selected samples by selection ID
+# Return discriminant tuples, distinct tuples, other tuples
-		Dictionary	{"Selection Method":["SampleID","SampleID"...]}
+return [ltupleDiscriminantSamples, ltupleDistinctSamples,
-		"""
+[tplData for tplData in ltpleAverageDistances if tplData[0] not in ldSelected]]
-		#Get labels and run one label against many
-		lstrMetadata = abundanceTable.funcGetMetadata(strSupervisedMetadata)
+# Run the supervised method surrounding distance from centroids
-		dictlltpleDistanceMeasurements = {}
+# Happy path tested (3 test cases)
-		for sMetadataValue in set(lstrMetadata):
+def funcRunSupervisedDistancesFromCentroids(self, abundanceTable, fRunDistinct, fRunDiscriminant,
+xOutputSupFile, xPredictSupFile, strSupervisedMetadata,
-			#For now perform the selection here for the label of interest against the other labels
+iSampleSupSelectionCount, lsOriginalSampleNames, lsOriginalLabels, fAppendFiles=False):
-			dictlltpleDistanceMeasurements.setdefault(sMetadataValue,[]).extend(self.funcPerformDistanceSelection(abndTable=abundanceTable,
+"""
-				iSelectionCount=iSampleSupSelectionCount, sLabel=strSupervisedMetadata, sValueOfInterest=sMetadataValue))
+Runs supervised methods based on measuring distances of one label from the centroid of another. NAs are evaluated as their own group.
-		#Make expected output files for supervised methods
+:param	abundanceTable:	AbundanceTable
-		#1. Output file which is similar to an input file for SVMs
+:type:	AbundanceTable	Data to analyze
-		#2. Output file that is similar to the probabilitic output of a SVM (LibSVM)
+:param	fRunDistinct:	Run distinct selection method
-		#Manly for making output of supervised methods (Distance from Centroid) similar
+:type:	Boolean	boolean (true runs method)
-		#MicropitaVis needs some of these files
+:param	fRunDiscriminant:	Run discriminant method
-		if xOutputSupFile:
+:type:	Boolean	boolean (true runs method)
-			if fAppendFiles:
+:param	xOutputSupFile:	File output from supervised methods detailing data going into the method.
-				SVM.funcUpdateSVMFileWithAbundanceTable(abndAbundanceTable=abundanceTable, xOutputSVMFile=xOutputSupFile,
+:type:	String or FileStream
-					lsOriginalLabels=lsOriginalLabels, lsSampleOrdering=lsOriginalSampleNames)
+:param	xPredictSupFile:	File output from supervised methods distance results from supervised methods.
-			else:
+:type:	String or FileStream
-				SVM.funcConvertAbundanceTableToSVMFile(abndAbundanceTable=abundanceTable, xOutputSVMFile=xOutputSupFile,
+:param strSupervisedMetadata:	The metadata that will be used to group samples.
-					sMetadataLabel=strSupervisedMetadata, lsOriginalLabels=lsOriginalLabels, lsSampleOrdering=lsOriginalSampleNames)
+:type:	String
+:param	iSampleSupSelectionCount:	Number of samples to select
-		#Will contain the samples selected to return
+:type:	Integer	int sample selection count
-		#One or more of the methods may be active so this is why I am extending instead of just returning the result of each method type
+:param lsOriginalSampleNames:	List of the sample names, order is important and should be preserved from the abundanceTable.
-		dictSelectedSamplesRet = dict()
+:type:	List of samples
-		for sKey, ltplDistances in dictlltpleDistanceMeasurements.items():
+:param	fAppendFiles:	Indicates that output files already exist and appending is occurring.
-			if fRunDistinct:
+:type:	Boolean
-				dictSelectedSamplesRet.setdefault(ConstantsMicropita.c_strDistinct,[]).extend([ltple[0] for ltple in ltplDistances[1]])
+:return	Selected Samples:	A dictionary of selected samples by selection ID
-			if fRunDiscriminant:
+Dictionary	{"Selection Method":["SampleID","SampleID"...]}
-				dictSelectedSamplesRet.setdefault(ConstantsMicropita.c_strDiscriminant,[]).extend([ltple[0] for ltple in ltplDistances[0]])
+"""
+# Get labels and run one label against many
-		if xPredictSupFile:
+lstrMetadata = abundanceTable.funcGetMetadata(strSupervisedMetadata)
-			dictFlattenedDistances = dict()
+dictlltpleDistanceMeasurements = {}
-			[dictFlattenedDistances.setdefault(sKey, []).append(tple)
+for sMetadataValue in set(lstrMetadata):
-				for sKey, lltple in dictlltpleDistanceMeasurements.items()
-				for ltple in lltple for tple in ltple]
+# For now perform the selection here for the label of interest against the other labels
-			if fAppendFiles:
+dictlltpleDistanceMeasurements.setdefault(sMetadataValue, []).extend(self.funcPerformDistanceSelection(abndTable=abundanceTable,
-				self._updatePredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xOutputSupFile,
+iSelectionCount=iSampleSupSelectionCount, sLabel=strSupervisedMetadata, sValueOfInterest=sMetadataValue))
-					dictltpleDistanceMeasurements=dictFlattenedDistances, abundanceTable=abundanceTable, lsOriginalSampleNames=lsOriginalSampleNames)
-			else:
+# Make expected output files for supervised methods
-				self._writeToPredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xOutputSupFile,
+# 1. Output file which is similar to an input file for SVMs
-					dictltpleDistanceMeasurements=dictFlattenedDistances, abundanceTable=abundanceTable, lsOriginalSampleNames=lsOriginalSampleNames)
+# 2. Output file that is similar to the probabilistic output of a SVM (LibSVM)
-		return dictSelectedSamplesRet
+# Mainly for making output of supervised methods (Distance from Centroid) similar
+# MicropitaVis needs some of these files
-	#Two happy path test cases
+if xOutputSupFile:
-	def _updatePredictFile(self, xPredictSupFile, xInputLabelsFile, dictltpleDistanceMeasurements, abundanceTable, lsOriginalSampleNames):
+if fAppendFiles:
-		"""
+SVM.funcUpdateSVMFileWithAbundanceTable(abndAbundanceTable=abundanceTable, xOutputSVMFile=xOutputSupFile,
-		Manages updating the predict file.
+lsOriginalLabels=lsOriginalLabels, lsSampleOrdering=lsOriginalSampleNames)
+else:
-		:param	xPredictSupFile: File that has predictions (distances) from the supervised method.
+SVM.funcConvertAbundanceTableToSVMFile(abndAbundanceTable=abundanceTable, xOutputSVMFile=xOutputSupFile,
-		:type:	FileStream or String file path
+sMetadataLabel=strSupervisedMetadata, lsOriginalLabels=lsOriginalLabels, lsSampleOrdering=lsOriginalSampleNames)
-		:param	xInputLabelsFile: File that as input to the supervised methods.
-		:type:	FileStream or String file path
+# Will contain the samples selected to return
-		:param	dictltpleDistanceMeasurements:
+# One or more of the methods may be active so this is why I am extending instead of just returning the result of each method type
-		:type:	Dictionary of lists of tuples {"labelgroup":[("SampleName",dDistance)...], "labelgroup":[("SampleName",dDistance)...]}
+dictSelectedSamplesRet = dict()
-		"""
+for sKey, ltplDistances in dictlltpleDistanceMeasurements.items():
+if fRunDistinct:
-		if not isinstance(xPredictSupFile, str):
+dictSelectedSamplesRet.setdefault(ConstantsMicropita.c_strDistinct, []).extend([
-			xPredictSupFile.close()
+ltple[0] for ltple in ltplDistances[1]])
-			xPredictSupFile = xPredictSupFile.name
+if fRunDiscriminant:
-		csvr = open(xPredictSupFile,'r')
+dictSelectedSamplesRet.setdefault(ConstantsMicropita.c_strDiscriminant, []).extend([
+ltple[0] for ltple in ltplDistances[0]])
-		f = csv.reader(csvr,delimiter=ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace)
-		lsHeader = f.next()[1:]
+if xPredictSupFile:
-		dictlltpleRead = dict([(sHeader,[]) for sHeader in lsHeader])
+dictFlattenedDistances = dict()
+[dictFlattenedDistances.setdefault(sKey, []).append(tple)
-		#Read data in
+for sKey, lltple in dictlltpleDistanceMeasurements.items()
-		iSampleIndex = 0
+for ltple in lltple for tple in ltple]
-		for sRow in f:
+if fAppendFiles:
-			sLabel = sRow[0]
+self._updatePredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xOutputSupFile,
-			[dictlltpleRead[lsHeader[iDistanceIndex]].append((lsOriginalSampleNames[iSampleIndex],dDistance)) for iDistanceIndex, dDistance in enumerate(sRow[1:])
+dictltpleDistanceMeasurements=dictFlattenedDistances, abundanceTable=abundanceTable, lsOriginalSampleNames=lsOriginalSampleNames)
-				if not dDistance == ConstantsMicropita.c_sEmptyPredictFileValue]
+else:
-			iSampleIndex += 1
+self._writeToPredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xOutputSupFile,
+dictltpleDistanceMeasurements=dictFlattenedDistances, abundanceTable=abundanceTable, lsOriginalSampleNames=lsOriginalSampleNames)
-		#Combine dictltpleDistanceMeasurements with new data
+return dictSelectedSamplesRet
-		#If they share a key then merge keeping parameter data
-		#If they do not share the key, keep the full data
+# Two happy path test cases
-		dictNew = {}
+def _updatePredictFile(self, xPredictSupFile, xInputLabelsFile, dictltpleDistanceMeasurements, abundanceTable, lsOriginalSampleNames):
-		for sKey in dictltpleDistanceMeasurements.keys():
+"""
-			lsSamples = [tple[0] for tple in dictltpleDistanceMeasurements[sKey]]
+Manages updating the predict file.
-			dictNew[sKey] = dictltpleDistanceMeasurements[sKey]+[tple for tple in dictlltpleRead[sKey] if tple[0] not in lsSamples] if sKey in dictlltpleRead.keys() else dictltpleDistanceMeasurements[sKey]
-for sKey in dictlltpleRead:
+:param	xPredictSupFile: File that has predictions (distances) from the supervised method.
-			if sKey not in dictltpleDistanceMeasurements.keys():
+:type:	FileStream or String file path
-				dictNew[sKey] = dictlltpleRead[sKey]
+:param	xInputLabelsFile: File used as input to the supervised methods.
+:type:	FileStream or String file path
-		#Call writer
+:param	dictltpleDistanceMeasurements:
-		self._writeToPredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xInputLabelsFile,
+:type:	Dictionary of lists of tuples {"labelgroup":[("SampleName",dDistance)...], "labelgroup":[("SampleName",dDistance)...]}
-			dictltpleDistanceMeasurements=dictNew, abundanceTable=abundanceTable,
+"""
-			lsOriginalSampleNames=lsOriginalSampleNames, fFromUpdate=True)
+if not isinstance(xPredictSupFile, str):
-	#2 happy path test cases
+xPredictSupFile.close()
-def _writeToPredictFile(self, xPredictSupFile, xInputLabelsFile, dictltpleDistanceMeasurements, abundanceTable, lsOriginalSampleNames, fFromUpdate=False):
+xPredictSupFile = xPredictSupFile.name
-		"""
+csvr = open(xPredictSupFile, 'r')
-		Write to the predict file.
+f = csv.reader(
-		:param	xPredictSupFile: File that has predictions (distances) from the supervised method.
+csvr, delimiter=ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace)
-		:type:	FileStream or String file path
+lsHeader = f.next()[1:]
-		:param	xInputLabelsFile: File that as input to the supervised methods.
+dictlltpleRead = dict([(sHeader, []) for sHeader in lsHeader])
-		:type:	FileStream or String file path
-		:param	dictltpleDistanceMeasurements:
+# Read data in
-		:type:	Dictionary of lists of tuples {"labelgroup":[("SampleName",dDistance)...], "labelgroup":[("SampleName",dDistance)...]}
+iSampleIndex = 0
-		:param	abundanceTable: An abundance table of the sample data.
+for sRow in f:
-		:type:	AbundanceTable
+sLabel = sRow[0]
-		:param	lsOriginalSampleNames: Used if the file is being updated as the sample names so that it may be passed in and consistent with other writing.
+[dictlltpleRead[lsHeader[iDistanceIndex]].append((lsOriginalSampleNames[iSampleIndex], dDistance)) for iDistanceIndex, dDistance in enumerate(sRow[1:])
-			Otherwise will use the sample names from the abundance table.
+if not dDistance == ConstantsMicropita.c_sEmptyPredictFileValue]
-		:type:	List of strings
+iSampleIndex += 1
-		:param	fFromUpdate:	Indicates if this is part of an update to the file or not.
-		:type:	Boolean
+# Combine dictltpleDistanceMeasurements with new data
-		"""
+# If they share a key then merge keeping parameter data
+# If they do not share the key, keep the full data
-		xInputLabelsFileName = xInputLabelsFile
+dictNew = {}
-		if not isinstance(xInputLabelsFile,str):
+for sKey in dictltpleDistanceMeasurements.keys():
-			xInputLabelsFileName = xInputLabelsFile.name
+lsSamples = [tple[0]
-		f = csv.writer(open(xPredictSupFile,"w") if isinstance(xPredictSupFile, str) else xPredictSupFile,delimiter=ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace)
+for tple in dictltpleDistanceMeasurements[sKey]]
+dictNew[sKey] = dictltpleDistanceMeasurements[sKey]+[tple for tple in dictlltpleRead[sKey] if tple[0]
-		lsAllSampleNames = abundanceTable.funcGetSampleNames()
+not in lsSamples] if sKey in dictlltpleRead.keys() else dictltpleDistanceMeasurements[sKey]
-		lsLabels = SVM.funcReadLabelsFromFile(xSVMFile=xInputLabelsFileName, lsAllSampleNames= lsOriginalSampleNames if fFromUpdate else lsAllSampleNames,
+for sKey in dictlltpleRead:
-						isPredictFile=False)
+if sKey not in dictltpleDistanceMeasurements.keys():
-		dictLabels = dict([(sSample,sLabel) for sLabel in lsLabels.keys() for sSample in lsLabels[sLabel]])
+dictNew[sKey] = dictlltpleRead[sKey]
-		#Dictionay keys will be used to order the predict file
+# Call writer
-		lsMeasurementKeys = dictltpleDistanceMeasurements.keys()
+self._writeToPredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xInputLabelsFile,
-		#Make header
+dictltpleDistanceMeasurements=dictNew, abundanceTable=abundanceTable,
-		f.writerow(["labels"]+lsMeasurementKeys)
+lsOriginalSampleNames=lsOriginalSampleNames, fFromUpdate=True)
-		#Reformat dictionary to make it easier to use
+# 2 happy path test cases
-		for sKey in dictltpleDistanceMeasurements:
+def _writeToPredictFile(self, xPredictSupFile, xInputLabelsFile, dictltpleDistanceMeasurements, abundanceTable, lsOriginalSampleNames, fFromUpdate=False):
-			dictltpleDistanceMeasurements[sKey] = dict([ltpl for ltpl in dictltpleDistanceMeasurements[sKey]])
+"""
+Write to the predict file.
-		for sSample in lsOriginalSampleNames:
-			#Make body of file
+:param	xPredictSupFile: File that has predictions (distances) from the supervised method.
-			f.writerow([dictLabels.get(sSample,ConstantsMicropita.c_sEmptyPredictFileValue)]+
+:type:	FileStream or String file path
-				[str(dictltpleDistanceMeasurements[sKey].get(sSample,ConstantsMicropita.c_sEmptyPredictFileValue))
+:param	xInputLabelsFile: File used as input to the supervised methods.
-				for sKey in lsMeasurementKeys])
+:type:	FileStream or String file path
+:param	dictltpleDistanceMeasurements:
-	def _funcRunNormalizeSensitiveMethods(self, abndData, iSampleSelectionCount, dictSelectedSamples, lsAlphaMetrics, lsBetaMetrics, lsInverseBetaMetrics,
+:type:	Dictionary of lists of tuples {"labelgroup":[("SampleName",dDistance)...], "labelgroup":[("SampleName",dDistance)...]}
-												fRunDiversity, fRunRepresentative, fRunExtreme, strAlphaMetadata=None,
+:param	abundanceTable: An abundance table of the sample data.
-												istmBetaMatrix=None, istrmTree=None, istrmEnvr=None, fInvertDiversity=False):
+:type:	AbundanceTable
-		"""
+:param	lsOriginalSampleNames: Used if the file is being updated as the sample names so that it may be passed in and consistent with other writing.
-		Manages running methods that are sensitive to normalization. This is called twice, once for the set of methods which should not be normalized and the other
+Otherwise will use the sample names from the abundance table.
-		for the set that should be normalized.
+:type:	List of strings
+:param	fFromUpdate:	Indicates if this is part of an update to the file or not.
-		:param	abndData:	Abundance table object holding the samples to be measured.
+:type:	Boolean
-		:type:	AbundanceTable
+"""
-		:param	iSampleSelectionCount	The number of samples to select per method.
-		:type:	Integer
+xInputLabelsFileName = xInputLabelsFile
-		:param	dictSelectedSamples	Will be added to as samples are selected {"Method:["strSelectedSampleID","strSelectedSampleID"...]}.
+if not isinstance(xInputLabelsFile, str):
-		:type:	Dictionary
+xInputLabelsFileName = xInputLabelsFile.name
-		:param	lsAlphaMetrics:	List of alpha metrics to use on alpha metric dependent assays (like highest diversity).
+f = csv.writer(open(xPredictSupFile, "w") if isinstance(xPredictSupFile, str)
-		:type:	List of strings
+else xPredictSupFile, delimiter=ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace)
-		:param	lsBetaMetrics:	List of beta metrics to use on beta metric dependent assays (like most representative).
-		:type:	List of strings
+lsAllSampleNames = abundanceTable.funcGetSampleNames()
-		:param	lsInverseBetaMetrics:	List of inverse beta metrics to use on inverse beta metric dependent assays (like most dissimilar).
+lsLabels = SVM.funcReadLabelsFromFile(xSVMFile=xInputLabelsFileName, lsAllSampleNames=lsOriginalSampleNames if fFromUpdate else lsAllSampleNames,
-		:type:	List of strings
+isPredictFile=False)
-		:param	fRunDiversity:	Run Diversity based methods (true indicates run).
+dictLabels = dict([(sSample, sLabel) for sLabel in lsLabels.keys()
-		:type:	Boolean
+for sSample in lsLabels[sLabel]])
-		:param	fRunRepresentative:	Run Representative based methods (true indicates run).
-		:type:	Boolean
+# Dictionary keys will be used to order the predict file
-		:param	fRunExtreme:	Run Extreme based methods (true indicates run).
+lsMeasurementKeys = dictltpleDistanceMeasurements.keys()
-		:type:	Boolean
+# Make header
-		:param	istmBetaMatrix:	File that has a precalculated beta matrix
+f.writerow(["labels"]+lsMeasurementKeys)
-		:type:	File stream or File path string
-		:return	Selected Samples:	Samples selected by methods.
+# Reformat dictionary to make it easier to use
-				Dictionary	{"Selection Method":["SampleID","SampleID","SampleID",...]}
+for sKey in dictltpleDistanceMeasurements:
-		"""
+dictltpleDistanceMeasurements[sKey] = dict(
+[ltpl for ltpl in dictltpleDistanceMeasurements[sKey]])
-		#Sample ids/names
-		lsSampleNames = abndData.funcGetSampleNames()
+for sSample in lsOriginalSampleNames:
+# Make body of file
-		#Generate alpha metrics and get most diverse
+f.writerow([dictLabels.get(sSample, ConstantsMicropita.c_sEmptyPredictFileValue)] +
-		if fRunDiversity:
+[str(dictltpleDistanceMeasurements[sKey].get(sSample, ConstantsMicropita.c_sEmptyPredictFileValue))
+for sKey in lsMeasurementKeys])
-			#Get Alpha metrics matrix
-			internalAlphaMatrix = None
+def _funcRunNormalizeSensitiveMethods(self, abndData, iSampleSelectionCount, dictSelectedSamples, lsAlphaMetrics, lsBetaMetrics, lsInverseBetaMetrics,
-			#Name of technique
+fRunDiversity, fRunRepresentative, fRunExtreme, strAlphaMetadata=None,
-			strMethod = [strAlphaMetadata] if strAlphaMetadata else lsAlphaMetrics
+istmBetaMatrix=None, istrmTree=None, istrmEnvr=None, fInvertDiversity=False):
+"""
-			#If given an alpha-diversity metadata
+Manages running methods that are sensitive to normalization. This is called twice, once for the set of methods which should not be normalized and the other
-			if strAlphaMetadata:
+for the set that should be normalized.
-				internalAlphaMatrix = [[float(strNum) for strNum in abndData.funcGetMetadata(strAlphaMetadata)]]
-			else:
+:param	abndData:	Abundance table object holding the samples to be measured.
-				#Expects Observations (Taxa (row) x sample (column))
+:type:	AbundanceTable
-				#Returns [[metric1-sample1, metric1-sample2, metric1-sample3],[metric1-sample1, metric1-sample2, metric1-sample3]]
+:param	iSampleSelectionCount:	The number of samples to select per method.
-				internalAlphaMatrix = Metric.funcBuildAlphaMetricsMatrix(npaSampleAbundance = abndData.funcGetAbundanceCopy()
+:type:	Integer
-							if not abndData.funcIsSummed()
+:param	dictSelectedSamples:	Will be added to as samples are selected {"Method":["strSelectedSampleID","strSelectedSampleID"...]}.
-							else abndData.funcGetFeatureAbundanceTable(abndData.funcGetTerminalNodes()).funcGetAbundanceCopy(),
+:type:	Dictionary
-							lsSampleNames = lsSampleNames, lsDiversityMetricAlpha = lsAlphaMetrics)
+:param	lsAlphaMetrics:	List of alpha metrics to use on alpha metric dependent assays (like highest diversity).
+:type:	List of strings
-			if internalAlphaMatrix:
+:param	lsBetaMetrics:	List of beta metrics to use on beta metric dependent assays (like most representative).
-				#Invert measurments
+:type:	List of strings
-				if fInvertDiversity:
+:param	lsInverseBetaMetrics:	List of inverse beta metrics to use on inverse beta metric dependent assays (like most dissimilar).
-					lldNewDiversity = []
+:type:	List of strings
-					for lsLine in internalAlphaMatrix:
+:param	fRunDiversity:	Run Diversity based methods (true indicates run).
-						lldNewDiversity.append([1/max(dValue,ConstantsMicropita.c_smallNumber) for dValue in lsLine])
+:type:	Boolean
-					internalAlphaMatrix = lldNewDiversity
+:param	fRunRepresentative:	Run Representative based methods (true indicates run).
-				#Get top ranked alpha diversity by most diverse
+:type:	Boolean
-				#Expects [[sample1,sample2,sample3...],[sample1,sample2,sample3..],...]
+:param	fRunExtreme:	Run Extreme based methods (true indicates run).
-				#Returns [[sampleName1, sampleName2, sampleNameN],[sampleName1, sampleName2, sampleNameN]]
+:type:	Boolean
-				mostDiverseAlphaSamplesIndexes = self.funcGetTopRankedSamples(lldMatrix=internalAlphaMatrix, lsSampleNames=lsSampleNames, iTopAmount=iSampleSelectionCount)
+:param	istmBetaMatrix:	File that has a precalculated beta matrix
+:type:	File stream or File path string
-				#Add to results
+:return	Selected Samples:	Samples selected by methods.
-				for index in xrange(0,len(strMethod)):
+Dictionary	{"Selection Method":["SampleID","SampleID","SampleID",...]}
-					strSelectionMethod = self.dictConvertAMetricDiversity.get(strMethod[index],ConstantsMicropita.c_strDiversity+"="+strMethod[index])
+"""
-					dictSelectedSamples.setdefault(strSelectionMethod,[]).extend(mostDiverseAlphaSamplesIndexes[index])
+# Sample ids/names
-		logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Selected Samples 1b")
+lsSampleNames = abndData.funcGetSampleNames()
-		logging.info(dictSelectedSamples)
+# Generate alpha metrics and get most diverse
-		#Generate beta metrics and
+if fRunDiversity:
-		if fRunRepresentative or fRunExtreme:
+# Get Alpha metrics matrix
-			#Abundance matrix transposed
+internalAlphaMatrix = None
-			npaTransposedAbundance = UtilityMath.funcTransposeDataMatrix(abndData.funcGetAbundanceCopy(), fRemoveAdornments=True)
+# Name of technique
+strMethod = [
-			#Get center selection using clusters/tiling
+strAlphaMetadata] if strAlphaMetadata else lsAlphaMetrics
-			#This will be for beta metrics in normalized space
-			if fRunRepresentative:
+# If given an alpha-diversity metadata
+if strAlphaMetadata:
-				if istmBetaMatrix:
+internalAlphaMatrix = [
-					#Get representative dissimilarity samples
+[float(strNum) for strNum in abndData.funcGetMetadata(strAlphaMetadata)]]
-					medoidSamples=self.funcGetCentralSamplesByKMedoids(npaMatrix=npaTransposedAbundance, sMetric=ConstantsMicropita.c_custom, lsSampleNames=lsSampleNames, iNumberSamplesReturned=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)
+else:
+# Expects Observations (Taxa (row) x sample (column))
-					if medoidSamples:
+#Returns [[metric1-sample1, metric1-sample2, metric1-sample3],[metric1-sample1, metric1-sample2, metric1-sample3]]
-						dictSelectedSamples.setdefault(ConstantsMicropita.c_strRepresentative+"="+ConstantsMicropita.c_custom,[]).extend(medoidSamples)
+internalAlphaMatrix = Metric.funcBuildAlphaMetricsMatrix(npaSampleAbundance=abndData.funcGetAbundanceCopy()
-				else:
+if not abndData.funcIsSummed()
-					logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Performing representative selection on normalized data.")
+else abndData.funcGetFeatureAbundanceTable(abndData.funcGetTerminalNodes()).funcGetAbundanceCopy(),
-					for bMetric in lsBetaMetrics:
+lsSampleNames=lsSampleNames, lsDiversityMetricAlpha=lsAlphaMetrics)
-						#Get representative dissimilarity samples
+if internalAlphaMatrix:
-						medoidSamples=self.funcGetCentralSamplesByKMedoids(npaMatrix=npaTransposedAbundance, sMetric=bMetric, lsSampleNames=lsSampleNames, iNumberSamplesReturned=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)
+# Invert measurements
+if fInvertDiversity:
-						if medoidSamples:
+lldNewDiversity = []
-							dictSelectedSamples.setdefault(self.dictConvertBMetricToMethod.get(bMetric,ConstantsMicropita.c_strRepresentative+"="+bMetric),[]).extend(medoidSamples)
+for lsLine in internalAlphaMatrix:
+lldNewDiversity.append(
-			#Get extreme selection using clusters, tiling
+[1/max(dValue, ConstantsMicropita.c_smallNumber) for dValue in lsLine])
-			if fRunExtreme:
+internalAlphaMatrix = lldNewDiversity
-				logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Performing extreme selection on normalized data.")
+# Get top ranked alpha diversity by most diverse
-				if istmBetaMatrix:
+# Expects [[sample1,sample2,sample3...],[sample1,sample2,sample3..],...]
+#Returns [[sampleName1, sampleName2, sampleNameN],[sampleName1, sampleName2, sampleNameN]]
-					#Samples for representative dissimilarity
+mostDiverseAlphaSamplesIndexes = self.funcGetTopRankedSamples(
-					#This involves inverting the distance metric,
+lldMatrix=internalAlphaMatrix, lsSampleNames=lsSampleNames, iTopAmount=iSampleSelectionCount)
-					#Taking the dendrogram level of where the number cluster == the number of samples to select
-					#Returning a repersentative sample from each cluster
+# Add to results
-					extremeSamples = self.funcSelectExtremeSamplesFromHClust(strBetaMetric=ConstantsMicropita.c_custom, npaAbundanceMatrix=npaTransposedAbundance, lsSampleNames=lsSampleNames, iSelectSampleCount=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)
+for index in xrange(0, len(strMethod)):
+strSelectionMethod = self.dictConvertAMetricDiversity.get(
-					#Add selected samples
+strMethod[index], ConstantsMicropita.c_strDiversity+"="+strMethod[index])
-					if extremeSamples:
+dictSelectedSamples.setdefault(strSelectionMethod, []).extend(
-						dictSelectedSamples.setdefault(ConstantsMicropita.c_strExtreme+"="+ConstantsMicropita.c_custom,[]).extend(extremeSamples)
+mostDiverseAlphaSamplesIndexes[index])
-				else:
+logging.info(
-					#Run KMedoids with inverse custom distance metric in normalized space
+"MicroPITA.funcRunNormalizeSensitiveMethods:: Selected Samples 1b")
-					for bMetric in lsInverseBetaMetrics:
+logging.info(dictSelectedSamples)
-						#Samples for representative dissimilarity
+# Generate beta metrics and select representative and/or extreme samples
-						#This involves inverting the distance metric,
+if fRunRepresentative or fRunExtreme:
-						#Taking the dendrogram level of where the number cluster == the number of samples to select
-						#Returning a repersentative sample from each cluster
+# Abundance matrix transposed
-						extremeSamples = self.funcSelectExtremeSamplesFromHClust(strBetaMetric=bMetric, npaAbundanceMatrix=npaTransposedAbundance, lsSampleNames=lsSampleNames, iSelectSampleCount=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)
+npaTransposedAbundance = UtilityMath.funcTransposeDataMatrix(
+abndData.funcGetAbundanceCopy(), fRemoveAdornments=True)
-						#Add selected samples
-						if extremeSamples:
+# Get center selection using clusters/tiling
-							dictSelectedSamples.setdefault(self.dictConvertInvBMetricToMethod.get(bMetric,ConstantsMicropita.c_strExtreme+"="+bMetric),[]).extend(extremeSamples)
+# This will be for beta metrics in normalized space
+if fRunRepresentative:
-		logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Selected Samples 2,3b")
-		logging.info(dictSelectedSamples)
+if istmBetaMatrix:
-		return dictSelectedSamples
+# Get representative dissimilarity samples
+medoidSamples = self.funcGetCentralSamplesByKMedoids(npaMatrix=npaTransposedAbundance, sMetric=ConstantsMicropita.c_custom, lsSampleNames=lsSampleNames,
-	def funcRun(self, strIDName, strLastMetadataName, istmInput,
+iNumberSamplesReturned=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)
-					  ostmInputPredictFile, ostmPredictFile, ostmCheckedFile, ostmOutput,
-					  cDelimiter, cFeatureNameDelimiter, strFeatureSelection,
+if medoidSamples:
-					  istmFeatures, iCount, lstrMethods, strLastRowMetadata = None, strLabel = None, strStratify = None,
+dictSelectedSamples.setdefault(
-					  strCustomAlpha = None, strCustomBeta = None, strAlphaMetadata = None, istmBetaMatrix = None, istrmTree = None, istrmEnvr = None,
+ConstantsMicropita.c_strRepresentative+"="+ConstantsMicropita.c_custom, []).extend(medoidSamples)
-					  iMinSeqs = ConstantsMicropita.c_liOccurenceFilter[0], iMinSamples = ConstantsMicropita.c_liOccurenceFilter[1], fInvertDiversity = False):
+else:
-		"""
+logging.info(
-		Manages the selection of samples given different metrics.
+"MicroPITA.funcRunNormalizeSensitiveMethods:: Performing representative selection on normalized data.")
+for bMetric in lsBetaMetrics:
-		:param	strIDName: Sample Id metadata row
-		:type:	String
+# Get representative dissimilarity samples
-		:param	strLastMetadataName: The id of the metadata positioned last in the abundance table.
+medoidSamples = self.funcGetCentralSamplesByKMedoids(npaMatrix=npaTransposedAbundance, sMetric=bMetric, lsSampleNames=lsSampleNames,
-		:type:	String	String metadata id.
+iNumberSamplesReturned=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)
-		:param	istmInput: File to store input data to supervised methods.
-		:type:	FileStream of String file path
+if medoidSamples:
-		:param	ostmInputPredictFile: File to store distances from supervised methods.
+dictSelectedSamples.setdefault(self.dictConvertBMetricToMethod.get(
-		:type:	FileStream or String file path
+bMetric, ConstantsMicropita.c_strRepresentative+"="+bMetric), []).extend(medoidSamples)
-		:param	ostmCheckedFile: File to store the AbundanceTable data after it is being checked.
-		:type:	FileStream or String file path
+# Get extreme selection using clusters, tiling
-		:param	ostmOutPut: File to store sample selection by methods of interest.
+if fRunExtreme:
-		:type:	FileStream or String file path
+logging.info(
-		:param	cDelimiter: Delimiter of abundance table.
+"MicroPITA.funcRunNormalizeSensitiveMethods:: Performing extreme selection on normalized data.")
-		:type:	Character Char (default TAB).
+if istmBetaMatrix:
-		:param	cFeatureNameDelimiter: Delimiter of the name of features (for instance if they contain consensus lineages indicating clades).
-		:type:	Character (default |).
+# Samples for representative dissimilarity
-		:param	stFeatureSelectionMethod: Which method to use to select features in a targeted manner (Using average ranked abundance or average abundance).
+# This involves inverting the distance metric,
-		:type:	String (specific values indicated in ConstantsMicropita.lsTargetedFeatureMethodValues).
+# Taking the dendrogram level of where the number cluster == the number of samples to select
-		:param	istmFeatures: File which holds the features of interest if using targeted feature methodology.
+# Returning a representative sample from each cluster
-		:type:	FileStream or String file path
+extremeSamples = self.funcSelectExtremeSamplesFromHClust(strBetaMetric=ConstantsMicropita.c_custom, npaAbundanceMatrix=npaTransposedAbundance,
-		:param	iCount:	Number of samples to select in each methods, supervised methods select this amount per label if possible.
+lsSampleNames=lsSampleNames, iSelectSampleCount=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)
-		:type:	Integer	integer.
-		:param	lstrMethods: List of strings indicating selection techniques.
+# Add selected samples
-		:type:	List of string method names
+if extremeSamples:
-		:param	strLabel: The metadata used for supervised labels.
+dictSelectedSamples.setdefault(
-		:type:	String
+ConstantsMicropita.c_strExtreme+"="+ConstantsMicropita.c_custom, []).extend(extremeSamples)
-		:param	strStratify: The metadata used to stratify unsupervised data.
-		:type:	String
+else:
-		:param	strCustomAlpha: Custom alpha diversity metric
+# Run KMedoids with inverse custom distance metric in normalized space
-		:type:	String
+for bMetric in lsInverseBetaMetrics:
-		:param	strCustomBeta: Custom beta diversity metric
-		:type:	String
+# Samples for representative dissimilarity
-		:param	strAlphaMetadata: Metadata id which is a diveristy metric to use in highest diversity sampling
+# This involves inverting the distance metric,
-		:type:	String
+# Taking the dendrogram level of where the number cluster == the number of samples to select
-		:param	istmBetaMatrix: File containing precalculated beta-diversity matrix for representative sampling
+# Returning a representative sample from each cluster
-		:type:	FileStream or String file path
+extremeSamples = self.funcSelectExtremeSamplesFromHClust(strBetaMetric=bMetric, npaAbundanceMatrix=npaTransposedAbundance, lsSampleNames=lsSampleNames,
-		:param	istrmTree: File containing tree for phylogentic beta-diversity analysis
+iSelectSampleCount=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)
-		:type:	FileStream or String file path
-		:param	istrmEnvr: File containing environment for phylogentic beta-diversity analysis
+# Add selected samples
-		:type:	FileStream or String file path
+if extremeSamples:
-		:param	iMinSeqs: Minimum sequence in the occurence filter which filters all features not with a minimum number of sequences in each of a minimum number of samples.
+dictSelectedSamples.setdefault(self.dictConvertInvBMetricToMethod.get(
-		:type:	Integer
+bMetric, ConstantsMicropita.c_strExtreme+"="+bMetric), []).extend(extremeSamples)
-		:param	iMinSamples: Minimum sample count for the occurence filter.
-		:type:	Integer
+logging.info(
-		:param	fInvertDiversity: When true will invert diversity measurements before using.
+"MicroPITA.funcRunNormalizeSensitiveMethods:: Selected Samples 2,3b")
-		:type:	boolean
+logging.info(dictSelectedSamples)
-		:return	Selected Samples:	Samples selected by methods.
+return dictSelectedSamples
-				Dictionary	{"Selection Method":["SampleID","SampleID","SampleID",...]}
-		"""
+def funcRun(self, strIDName, strLastMetadataName, istmInput,
+ostmInputPredictFile, ostmPredictFile, ostmCheckedFile, ostmOutput,
-		#Holds the top ranked samples from different metrics
+cDelimiter, cFeatureNameDelimiter, strFeatureSelection,
-		#dict[metric name] = [samplename,samplename...]
+istmFeatures, iCount, lstrMethods, strLastRowMetadata=None, strLabel=None, strStratify=None,
-		selectedSamples = dict()
+strCustomAlpha=None, strCustomBeta=None, strAlphaMetadata=None, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None,
+iMinSeqs=ConstantsMicropita.c_liOccurenceFilter[0], iMinSamples=ConstantsMicropita.c_liOccurenceFilter[1], fInvertDiversity=False):
-		#If a target feature file is given make sure that targeted feature is in the selection methods, if not add
+"""
-		if ConstantsMicropita.c_strFeature in lstrMethods:
+Manages the selection of samples given different metrics.
-		  if not istmFeatures:
-			logging.error("MicroPITA.funcRun:: Did not receive both the Targeted feature file and the feature selection method. MicroPITA did not run.")
+:param	strIDName: Sample Id metadata row
-			return False
+:type:	String
+:param	strLastMetadataName: The id of the metadata positioned last in the abundance table.
-		#Diversity metrics to run
+:type:	String	String metadata id.
-		#Use custom metrics if specified
+:param	istmInput: File to store input data to supervised methods.
-#Custom beta metrics set to normalized only, custom alpha metrics set to count only
+:type:	FileStream or String file path
-		diversityMetricsAlpha = [] if strCustomAlpha or strAlphaMetadata else [MicroPITA.c_strInverseSimpsonDiversity]
+:param	ostmInputPredictFile: File to store distances from supervised methods.
-		diversityMetricsBeta = [] if istmBetaMatrix else [strCustomBeta] if strCustomBeta else [MicroPITA.c_strBrayCurtisDissimilarity]
+:type:	FileStream or String file path
+:param	ostmCheckedFile: File to store the AbundanceTable data after it is being checked.
+:type:	FileStream or String file path
+:param	ostmOutput: File to store sample selection by methods of interest.
+:type:	FileStream or String file path
+:param	cDelimiter: Delimiter of abundance table.
+:type:	Character Char (default TAB).
+:param	cFeatureNameDelimiter: Delimiter of the name of features (for instance if they contain consensus lineages indicating clades).
+:type:	Character (default |).
+:param	strFeatureSelection: Which method to use to select features in a targeted manner (Using average ranked abundance or average abundance).
+:type:	String (specific values indicated in ConstantsMicropita.lsTargetedFeatureMethodValues).
+:param	istmFeatures: File which holds the features of interest if using targeted feature methodology.
+:type:	FileStream or String file path
+:param	iCount:	Number of samples to select in each methods, supervised methods select this amount per label if possible.
+:type:	Integer	integer.
+:param	lstrMethods: List of strings indicating selection techniques.
+:type:	List of string method names
+:param	strLabel: The metadata used for supervised labels.
+:type:	String
+:param	strStratify: The metadata used to stratify unsupervised data.
+:type:	String
+:param	strCustomAlpha: Custom alpha diversity metric
+:type:	String
+:param	strCustomBeta: Custom beta diversity metric
+:type:	String
+:param	strAlphaMetadata: Metadata id which is a diversity metric to use in highest diversity sampling
+:type:	String
+:param	istmBetaMatrix: File containing precalculated beta-diversity matrix for representative sampling
+:type:	FileStream or String file path
+:param	istrmTree: File containing tree for phylogenetic beta-diversity analysis
+:type:	FileStream or String file path
+:param	istrmEnvr: File containing environment for phylogenetic beta-diversity analysis
+:type:	FileStream or String file path
+:param	iMinSeqs: Minimum sequence count for the occurrence filter, which removes all features that do not have a minimum number of sequences in each of a minimum number of samples.
+:type:	Integer
+:param	iMinSamples: Minimum sample count for the occurrence filter.
+:type:	Integer
+:param	fInvertDiversity: When true will invert diversity measurements before using.
+:type:	boolean
+:return	Selected Samples:	Samples selected by methods.
+Dictionary	{"Selection Method":["SampleID","SampleID","SampleID",...]}
+"""
+# Holds the top ranked samples from different metrics
+# dict[metric name] = [samplename,samplename...]
+selectedSamples = dict()
+# If a target feature file is given make sure that targeted feature is in the selection methods, if not add
+if ConstantsMicropita.c_strFeature in lstrMethods:
+if not istmFeatures:
+logging.error(
+"MicroPITA.funcRun:: Did not receive both the Targeted feature file and the feature selection method. MicroPITA did not run.")
+return False
+# Diversity metrics to run
+# Use custom metrics if specified
+# Custom beta metrics set to normalized only, custom alpha metrics set to count only
+diversityMetricsAlpha = [] if strCustomAlpha or strAlphaMetadata else [
+MicroPITA.c_strInverseSimpsonDiversity]
+diversityMetricsBeta = [] if istmBetaMatrix else [
+strCustomBeta] if strCustomBeta else [MicroPITA.c_strBrayCurtisDissimilarity]
 #		inverseDiversityMetricsBeta = [MicroPITA.c_strInvBrayCurtisDissimilarity]
-		diversityMetricsAlphaNoNormalize = [strAlphaMetadata] if strAlphaMetadata else [strCustomAlpha] if strCustomAlpha else []
+diversityMetricsAlphaNoNormalize = [strAlphaMetadata] if strAlphaMetadata else [
-		diversityMetricsBetaNoNormalize = []
+strCustomAlpha] if strCustomAlpha else []
+diversityMetricsBetaNoNormalize = []
 #		inverseDiversityMetricsBetaNoNormalize = []
-		#Targeted taxa
+# Targeted taxa
-		userDefinedTaxa = []
+userDefinedTaxa = []
-		#Perform different flows flags
+# Perform different flows flags
-		c_RUN_MAX_DIVERSITY_1 = ConstantsMicropita.c_strDiversity in lstrMethods
+c_RUN_MAX_DIVERSITY_1 = ConstantsMicropita.c_strDiversity in lstrMethods
-		c_RUN_REPRESENTIVE_DISSIMILARITY_2 = ConstantsMicropita.c_strRepresentative in lstrMethods
+c_RUN_REPRESENTIVE_DISSIMILARITY_2 = ConstantsMicropita.c_strRepresentative in lstrMethods
-		c_RUN_MAX_DISSIMILARITY_3 = ConstantsMicropita.c_strExtreme in lstrMethods
+c_RUN_MAX_DISSIMILARITY_3 = ConstantsMicropita.c_strExtreme in lstrMethods
-		c_RUN_RANK_AVERAGE_USER_4 = False
+c_RUN_RANK_AVERAGE_USER_4 = False
-		if ConstantsMicropita.c_strFeature in lstrMethods:
+if ConstantsMicropita.c_strFeature in lstrMethods:
-			c_RUN_RANK_AVERAGE_USER_4 = True
+c_RUN_RANK_AVERAGE_USER_4 = True
-			if not istmFeatures:
+if not istmFeatures:
-				logging.error("MicroPITA.funcRun:: No taxa file was given for taxa selection.")
+logging.error(
-				return False
+"MicroPITA.funcRun:: No taxa file was given for taxa selection.")
-			#Read in taxa list, break down to lines and filter out empty strings
+return False
-			userDefinedTaxa = filter(None,(s.strip( ) for s in istmFeatures.readlines()))
+# Read in taxa list, break down to lines and filter out empty strings
-		c_RUN_RANDOM_5 = ConstantsMicropita.c_strRandom in lstrMethods
+userDefinedTaxa = filter(None, (s.strip()
-		c_RUN_DISTINCT = ConstantsMicropita.c_strDistinct in lstrMethods
+for s in istmFeatures.readlines()))
-		c_RUN_DISCRIMINANT = ConstantsMicropita.c_strDiscriminant in lstrMethods
+c_RUN_RANDOM_5 = ConstantsMicropita.c_strRandom in lstrMethods
+c_RUN_DISTINCT = ConstantsMicropita.c_strDistinct in lstrMethods
-		#Read in abundance data
+c_RUN_DISCRIMINANT = ConstantsMicropita.c_strDiscriminant in lstrMethods
-		#Abundance is a structured array. Samples (column) by Taxa (rows) with the taxa id row included as the column index=0
-		#Abundance table object to read in and manage data
+# Read in abundance data
-		totalAbundanceTable = AbundanceTable.funcMakeFromFile(xInputFile=istmInput, lOccurenceFilter = [iMinSeqs, iMinSamples],
+# Abundance is a structured array. Samples (column) by Taxa (rows) with the taxa id row included as the column index=0
-								cDelimiter=cDelimiter, sMetadataID=strIDName, sLastMetadataRow=strLastRowMetadata,
+# Abundance table object to read in and manage data
-								sLastMetadata=strLastMetadataName, cFeatureNameDelimiter=cFeatureNameDelimiter, xOutputFile=ostmCheckedFile)
+totalAbundanceTable = AbundanceTable.funcMakeFromFile(xInputFile=istmInput, lOccurenceFilter=[iMinSeqs, iMinSamples],
-		if not totalAbundanceTable:
+cDelimiter=cDelimiter, sMetadataID=strIDName, sLastMetadataRow=strLastRowMetadata,
-			logging.error("MicroPITA.funcRun:: Could not read in the abundance table. Analysis was not performed."+
+sLastMetadata=strLastMetadataName, cFeatureNameDelimiter=cFeatureNameDelimiter, xOutputFile=ostmCheckedFile)
-				" This often occurs when the Last Metadata is not specified correctly."+
+if not totalAbundanceTable:
-				" Please check to make sure the Last Metadata selection is the row of the last metadata,"+
+logging.error("MicroPITA.funcRun:: Could not read in the abundance table. Analysis was not performed." +
-				" all values after this selection should be microbial measurements and should be numeric.")
+" This often occurs when the Last Metadata is not specified correctly." +
-			return False
+" Please check to make sure the Last Metadata selection is the row of the last metadata," +
+" all values after this selection should be microbial measurements and should be numeric.")
-		lsOriginalLabels = SVM.funcMakeLabels(totalAbundanceTable.funcGetMetadata(strLabel)) if strLabel else strLabel
+return False
-		dictTotalMetadata = totalAbundanceTable.funcGetMetadataCopy()
+lsOriginalLabels = SVM.funcMakeLabels(
-		logging.debug("MicroPITA.funcRun:: Received metadata=" + str(dictTotalMetadata))
+totalAbundanceTable.funcGetMetadata(strLabel)) if strLabel else strLabel
-		#If there is only 1 unique value for the labels, do not run the Supervised methods
-		if strLabel and ( len(set(dictTotalMetadata.get(strLabel,[]))) < 2 ):
+dictTotalMetadata = totalAbundanceTable.funcGetMetadataCopy()
-			logging.error("The label " + strLabel + " did not have 2 or more values. Labels found=" + str(dictTotalMetadata.get(strLabel,[])))
+logging.debug("MicroPITA.funcRun:: Received metadata=" +
-			return False
+str(dictTotalMetadata))
+# If there is only 1 unique value for the labels, do not run the Supervised methods
-		#Run unsupervised methods###
+if strLabel and (len(set(dictTotalMetadata.get(strLabel, []))) < 2):
-		#Stratify the data if need be and drop the old data
+logging.error("The label " + strLabel + " did not have 2 or more values. Labels found=" +
-		lStratifiedAbundanceTables = totalAbundanceTable.funcStratifyByMetadata(strStratify) if strStratify else [totalAbundanceTable]
+str(dictTotalMetadata.get(strLabel, [])))
+return False
-		#For each stratified abundance block or for the unstratfified abundance
-		#Run the unsupervised blocks
+#Run unsupervised methods###
-		fAppendSupFiles = False
+# Stratify the data if need be and drop the old data
-		for stratAbundanceTable in lStratifiedAbundanceTables:
+lStratifiedAbundanceTables = totalAbundanceTable.funcStratifyByMetadata(
-			logging.info("MicroPITA.funcRun:: Running abundance block:"+stratAbundanceTable.funcGetName())
+strStratify) if strStratify else [totalAbundanceTable]
-			###NOT SUMMED, NOT NORMALIZED
+# For each stratified abundance block or for the unstratified abundance
-			#Only perform if the data is not yet normalized
+# Run the unsupervised blocks
-			if not stratAbundanceTable.funcIsNormalized( ):
+fAppendSupFiles = False
-				#Need to first work with unnormalized data
+for stratAbundanceTable in lStratifiedAbundanceTables:
-				if c_RUN_MAX_DIVERSITY_1 or c_RUN_REPRESENTIVE_DISSIMILARITY_2 or c_RUN_MAX_DISSIMILARITY_3:
+logging.info("MicroPITA.funcRun:: Running abundance block:" +
+stratAbundanceTable.funcGetName())
-					self._funcRunNormalizeSensitiveMethods(abndData=stratAbundanceTable, iSampleSelectionCount=iCount,
-													 dictSelectedSamples=selectedSamples, lsAlphaMetrics=diversityMetricsAlphaNoNormalize,
+# NOT SUMMED, NOT NORMALIZED
-													 lsBetaMetrics=diversityMetricsBetaNoNormalize,
+# Only perform if the data is not yet normalized
-													 lsInverseBetaMetrics=diversityMetricsBetaNoNormalize,
+if not stratAbundanceTable.funcIsNormalized():
-													 fRunDiversity=c_RUN_MAX_DIVERSITY_1,fRunRepresentative=c_RUN_REPRESENTIVE_DISSIMILARITY_2,
+# Need to first work with unnormalized data
-													 fRunExtreme=c_RUN_MAX_DISSIMILARITY_3, strAlphaMetadata=strAlphaMetadata,
+if c_RUN_MAX_DIVERSITY_1 or c_RUN_REPRESENTIVE_DISSIMILARITY_2 or c_RUN_MAX_DISSIMILARITY_3:
-istrmTree=istrmTree, istrmEnvr=istrmEnvr, fInvertDiversity=fInvertDiversity)
+self._funcRunNormalizeSensitiveMethods(abndData=stratAbundanceTable, iSampleSelectionCount=iCount,
+dictSelectedSamples=selectedSamples, lsAlphaMetrics=diversityMetricsAlphaNoNormalize,
-			#Generate selection by the rank average of user defined taxa
+lsBetaMetrics=diversityMetricsBetaNoNormalize,
-			#Expects (Taxa (row) by Samples (column))
+lsInverseBetaMetrics=diversityMetricsBetaNoNormalize,
-			#Expects a column 0 of taxa id that is skipped
+fRunDiversity=c_RUN_MAX_DIVERSITY_1, fRunRepresentative=c_RUN_REPRESENTIVE_DISSIMILARITY_2,
-			#Returns [(sample name,average,rank)]
+fRunExtreme=c_RUN_MAX_DISSIMILARITY_3, strAlphaMetadata=strAlphaMetadata,
-			#SUMMED AND NORMALIZED
+istrmTree=istrmTree, istrmEnvr=istrmEnvr, fInvertDiversity=fInvertDiversity)
-			stratAbundanceTable.funcSumClades()
-			#Normalize data at this point
+# Generate selection by the rank average of user defined taxa
-			stratAbundanceTable.funcNormalize()
+# Expects (Taxa (row) by Samples (column))
-			if c_RUN_RANK_AVERAGE_USER_4:
+# Expects a column 0 of taxa id that is skipped
-				selectedSamples[ConstantsMicropita.c_strFeature] = self.funcSelectTargetedTaxaSamples(abndMatrix=stratAbundanceTable,
+# Returns [(sample name,average,rank)]
-						lsTargetedTaxa=userDefinedTaxa, iSampleSelectionCount=iCount, sMethod=strFeatureSelection)
+# SUMMED AND NORMALIZED
-				logging.info("MicroPITA.funcRun:: Selected Samples Rank")
+stratAbundanceTable.funcSumClades()
-				logging.info(selectedSamples)
+# Normalize data at this point
+stratAbundanceTable.funcNormalize()
-			###SUMMED AND NORMALIZED analysis block
+if c_RUN_RANK_AVERAGE_USER_4:
-			#Diversity based metric will move reduce to terminal taxa as needed
+selectedSamples[ConstantsMicropita.c_strFeature] = self.funcSelectTargetedTaxaSamples(abndMatrix=stratAbundanceTable,
-			if c_RUN_MAX_DIVERSITY_1 or c_RUN_REPRESENTIVE_DISSIMILARITY_2 or c_RUN_MAX_DISSIMILARITY_3:
+lsTargetedTaxa=userDefinedTaxa, iSampleSelectionCount=iCount, sMethod=strFeatureSelection)
+logging.info("MicroPITA.funcRun:: Selected Samples Rank")
-				self._funcRunNormalizeSensitiveMethods(abndData=stratAbundanceTable, iSampleSelectionCount=iCount,
+logging.info(selectedSamples)
-												 dictSelectedSamples=selectedSamples, lsAlphaMetrics=diversityMetricsAlpha,
-												 lsBetaMetrics=diversityMetricsBeta,
+# SUMMED AND NORMALIZED analysis block
-												 lsInverseBetaMetrics=diversityMetricsBeta,
+# Diversity-based metrics will reduce to terminal taxa as needed
-												 fRunDiversity=c_RUN_MAX_DIVERSITY_1,fRunRepresentative=c_RUN_REPRESENTIVE_DISSIMILARITY_2,
+if c_RUN_MAX_DIVERSITY_1 or c_RUN_REPRESENTIVE_DISSIMILARITY_2 or c_RUN_MAX_DISSIMILARITY_3:
-												 fRunExtreme=c_RUN_MAX_DISSIMILARITY_3,
-istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr, fInvertDiversity=fInvertDiversity)
+self._funcRunNormalizeSensitiveMethods(abndData=stratAbundanceTable, iSampleSelectionCount=iCount,
+dictSelectedSamples=selectedSamples, lsAlphaMetrics=diversityMetricsAlpha,
-			#5::Select randomly
+lsBetaMetrics=diversityMetricsBeta,
-			#Expects sampleNames = List of sample names [name, name, name...]
+lsInverseBetaMetrics=diversityMetricsBeta,
-			if(c_RUN_RANDOM_5):
+fRunDiversity=c_RUN_MAX_DIVERSITY_1, fRunRepresentative=c_RUN_REPRESENTIVE_DISSIMILARITY_2,
-				#Select randomly from sample names
+fRunExtreme=c_RUN_MAX_DISSIMILARITY_3,
-				selectedSamples[ConstantsMicropita.c_strRandom] = self.funcGetRandomSamples(lsSamples=stratAbundanceTable.funcGetSampleNames(), iNumberOfSamplesToReturn=iCount)
+istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr, fInvertDiversity=fInvertDiversity)
-				logging.info("MicroPITA.funcRun:: Selected Samples Random")
-				logging.info(selectedSamples)
+# 5::Select randomly
+# Expects sampleNames = List of sample names [name, name, name...]
-			#Perform supervised selection
+if(c_RUN_RANDOM_5):
-			if c_RUN_DISTINCT or c_RUN_DISCRIMINANT:
+# Select randomly from sample names
-				if strLabel:
+selectedSamples[ConstantsMicropita.c_strRandom] = self.funcGetRandomSamples(
-					dictSelectionRet = self.funcRunSupervisedDistancesFromCentroids(abundanceTable=stratAbundanceTable,
+lsSamples=stratAbundanceTable.funcGetSampleNames(), iNumberOfSamplesToReturn=iCount)
-								fRunDistinct=c_RUN_DISTINCT, fRunDiscriminant=c_RUN_DISCRIMINANT,
+logging.info("MicroPITA.funcRun:: Selected Samples Random")
-								xOutputSupFile=ostmInputPredictFile,xPredictSupFile=ostmPredictFile,
+logging.info(selectedSamples)
-								strSupervisedMetadata=strLabel, iSampleSupSelectionCount=iCount,
-								lsOriginalSampleNames = totalAbundanceTable.funcGetSampleNames(),
+# Perform supervised selection
-								lsOriginalLabels = lsOriginalLabels,
+if c_RUN_DISTINCT or c_RUN_DISCRIMINANT:
-								fAppendFiles=fAppendSupFiles)
+if strLabel:
+dictSelectionRet = self.funcRunSupervisedDistancesFromCentroids(abundanceTable=stratAbundanceTable,
-					[selectedSamples.setdefault(sKey,[]).extend(lValue) for sKey,lValue in dictSelectionRet.items()]
+fRunDistinct=c_RUN_DISTINCT, fRunDiscriminant=c_RUN_DISCRIMINANT,
+xOutputSupFile=ostmInputPredictFile, xPredictSupFile=ostmPredictFile,
-					if not fAppendSupFiles:
+strSupervisedMetadata=strLabel, iSampleSupSelectionCount=iCount,
-						fAppendSupFiles = True
+lsOriginalSampleNames=totalAbundanceTable.funcGetSampleNames(),
-					logging.info("MicroPITA.funcRun:: Selected Samples Unsupervised")
+lsOriginalLabels=lsOriginalLabels,
-					logging.info(selectedSamples)
+fAppendFiles=fAppendSupFiles)
-		return selectedSamples
+[selectedSamples.setdefault(sKey, []).extend(
-	#Testing: Happy path tested
+lValue) for sKey, lValue in dictSelectionRet.items()]
-	@staticmethod
-	def funcWriteSelectionToFile(dictSelection,xOutputFilePath):
+if not fAppendSupFiles:
-		"""
+fAppendSupFiles = True
-		Writes the selection of samples by method to an output file.
+logging.info(
+"MicroPITA.funcRun:: Selected Samples Unsupervised")
-		:param	dictSelection:	The dictionary of selections by method to be written to a file.
+logging.info(selectedSamples)
-		:type:	Dictionary	The dictionary of selections by method {"method":["sample selected","sample selected"...]}
+return selectedSamples
-		:param	xOutputFilePath:	FileStream or String path to file inwhich the dictionary is written.
-		:type:	String	FileStream or String path to file
+# Testing: Happy path tested
-		"""
+@staticmethod
+def funcWriteSelectionToFile(dictSelection, xOutputFilePath):
-		if not dictSelection:
+"""
-			return
+Writes the selection of samples by method to an output file.
-		#Open file
+:param	dictSelection:	The dictionary of selections by method to be written to a file.
-		f = csv.writer(open(xOutputFilePath,"w") if isinstance(xOutputFilePath, str) else xOutputFilePath, delimiter=ConstantsMicropita.c_outputFileDelim )
+:type:	Dictionary	The dictionary of selections by method {"method":["sample selected","sample selected"...]}
+:param	xOutputFilePath:	FileStream or String path to file in which the dictionary is written.
-		#Create output content from dictionary
+:type:	String	FileStream or String path to file
-		for sKey in dictSelection:
+"""
-			f.writerow([sKey]+dictSelection[sKey])
-			logging.debug("MicroPITA.funcRun:: Selected samples output to file:"+str(dictSelection[sKey]))
+if not dictSelection:
+return
-	#Testing: Happy Path tested
-	@staticmethod
+# Open file
-	def funcReadSelectionFileToDictionary(xInputFile):
+f = csv.writer(open(xOutputFilePath, "w") if isinstance(
-		"""
+xOutputFilePath, str) else xOutputFilePath, delimiter=ConstantsMicropita.c_outputFileDelim)
-		Reads in an output selection file from micropita and formats it into a dictionary.
+# Create output content from dictionary
-		:param	xInputFile:	String path to file or file stream to read and translate into a dictionary.
+for sKey in dictSelection:
-									{"method":["sample selected","sample selected"...]}
+f.writerow([sKey]+dictSelection[sKey])
-		:type:	FileStream or String Path to file
+logging.debug(
-		:return	Dictionary:	Samples selected by methods.
+"MicroPITA.funcRun:: Selected samples output to file:"+str(dictSelection[sKey]))
-					Dictionary	{"Selection Method":["SampleID","SampleID","SampleID",...]}
-		"""
+# Testing: Happy Path tested
+@staticmethod
-		#Open file
+def funcReadSelectionFileToDictionary(xInputFile):
-		istmReader = csv.reader(open(xInputFile,'r') if isinstance(xInputFile, str) else xInputFile, delimiter = ConstantsMicropita.c_outputFileDelim)
+"""
+Reads in an output selection file from micropita and formats it into a dictionary.
-		#Dictionary to hold selection data
-		return dict([(lsLine[0], lsLine[1:]) for lsLine in istmReader])
+:param	xInputFile:	String path to file or file stream to read and translate into a dictionary.
+{"method":["sample selected","sample selected"...]}
-#Set up arguments reader
+:type:	FileStream or String Path to file
-argp = argparse.ArgumentParser( prog = "MicroPITA.py",
+:return	Dictionary:	Samples selected by methods.
-	description = """Selects samples from abundance tables based on various selection schemes.""" )
+Dictionary	{"Selection Method":["SampleID","SampleID","SampleID",...]}
+"""
-args = argp.add_argument_group( "Common", "Commonly modified options" )
-args.add_argument(ConstantsMicropita.c_strCountArgument,"--num", dest="iCount", metavar = "samples", default = 10, type = int, help = ConstantsMicropita.c_strCountHelp)
+# Open file
-args.add_argument("-m","--method", dest = "lstrMethods", metavar = "method", default = [], help = ConstantsMicropita.c_strSelectionTechniquesHelp,
+istmReader = csv.reader(open(xInputFile, 'r') if isinstance(
-	choices = ConstantsMicropita.c_lsAllMethods, action = "append")
+xInputFile, str) else xInputFile, delimiter=ConstantsMicropita.c_outputFileDelim)
-args = argp.add_argument_group( "Custom", "Selecting and inputing custom metrics" )
+# Dictionary to hold selection data
-args.add_argument("-a","--alpha", dest = "strAlphaDiversity", metavar = "AlphaDiversity", default = None, help = ConstantsMicropita.c_strCustomAlphaDiversityHelp,  choices = Metric.setAlphaDiversities)
+return dict([(lsLine[0], lsLine[1:]) for lsLine in istmReader])
-args.add_argument("-b","--beta", dest = "strBetaDiversity", metavar = "BetaDiversity", default = None, help = ConstantsMicropita.c_strCustomBetaDiversityHelp,  choices = list(Metric.setBetaDiversities)+[Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted])
-args.add_argument("-q","--alphameta", dest = "strAlphaMetadata", metavar = "AlphaDiversityMetadata", default = None, help = ConstantsMicropita.c_strCustomAlphaDiversityMetadataHelp)
-args.add_argument("-x","--betamatrix", dest = "istmBetaMatrix", metavar = "BetaDiversityMatrix", default = None, help = ConstantsMicropita.c_strCustomBetaDiversityMatrixHelp)
+# Set up arguments reader
-args.add_argument("-o","--tree", dest = "istrmTree", metavar = "PhylogeneticTree", default = None, help = ConstantsMicropita.c_strCustomPhylogeneticTreeHelp)
+argp = argparse.ArgumentParser(prog="MicroPITA.py",
-args.add_argument("-i","--envr", dest = "istrmEnvr", metavar = "EnvironmentFile", default = None, help = ConstantsMicropita.c_strCustomEnvironmentFileHelp)
+description="""Selects samples from abundance tables based on various selection schemes.""")
-args.add_argument("-f","--invertDiversity", dest = "fInvertDiversity", action="store_true", default = False, help = ConstantsMicropita.c_strInvertDiversityHelp)
+args = argp.add_argument_group("Common", "Commonly modified options")
-args = argp.add_argument_group( "Miscellaneous", "Row/column identifiers and feature targeting options" )
+args.add_argument(ConstantsMicropita.c_strCountArgument, "--num", dest="iCount",
-args.add_argument("-d",ConstantsMicropita.c_strIDNameArgument, dest="strIDName", metavar="sample_id", help= ConstantsMicropita.c_strIDNameHelp)
+metavar="samples", default=10, type=int, help=ConstantsMicropita.c_strCountHelp)
-args.add_argument("-l",ConstantsMicropita.c_strLastMetadataNameArgument, dest="strLastMetadataName", metavar = "metadata_id", default = None,
+args.add_argument("-m", "--method", dest="lstrMethods", metavar="method", default=[], help=ConstantsMicropita.c_strSelectionTechniquesHelp,
-				  help= ConstantsMicropita.c_strLastMetadataNameHelp)
+choices=ConstantsMicropita.c_lsAllMethods, action="append")
-args.add_argument("-r",ConstantsMicropita.c_strTargetedFeatureMethodArgument, dest="strFeatureSelection", metavar="targeting_method", default=ConstantsMicropita.lsTargetedFeatureMethodValues[0],
-				  choices=ConstantsMicropita.lsTargetedFeatureMethodValues, help= ConstantsMicropita.c_strTargetedFeatureMethodHelp)
+args = argp.add_argument_group(
-args.add_argument("-t",ConstantsMicropita.c_strTargetedSelectionFileArgument, dest="istmFeatures", metavar="feature_file", type=argparse.FileType("rU"), help=ConstantsMicropita.c_strTargetedSelectionFileHelp)
+"Custom", "Selecting and inputing custom metrics")
-args.add_argument("-w",ConstantsMicropita.c_strFeatureMetadataArgument, dest="strLastFeatureMetadata", metavar="Last_Feature_Metadata", default=None, help=ConstantsMicropita.c_strFeatureMetadataHelp)
+args.add_argument("-a", "--alpha", dest="strAlphaDiversity", metavar="AlphaDiversity", default=None,
+help=ConstantsMicropita.c_strCustomAlphaDiversityHelp,  choices=Metric.setAlphaDiversities)
-args = argp.add_argument_group( "Data labeling", "Metadata IDs for strata and supervised label values" )
+args.add_argument("-b", "--beta", dest="strBetaDiversity", metavar="BetaDiversity", default=None, help=ConstantsMicropita.c_strCustomBetaDiversityHelp,
-args.add_argument("-e",ConstantsMicropita.c_strSupervisedLabelArgument, dest="strLabel", metavar= "supervised_id", help=ConstantsMicropita.c_strSupervisedLabelHelp)
+choices=list(Metric.setBetaDiversities)+[Metric.c_strUnifracUnweighted, Metric.c_strUnifracWeighted])
-args.add_argument("-s",ConstantsMicropita.c_strUnsupervisedStratifyMetadataArgument, dest="strUnsupervisedStratify", metavar="stratify_id",
+args.add_argument("-q", "--alphameta", dest="strAlphaMetadata", metavar="AlphaDiversityMetadata",
-				  help= ConstantsMicropita.c_strUnsupervisedStratifyMetadataHelp)
+default=None, help=ConstantsMicropita.c_strCustomAlphaDiversityMetadataHelp)
+args.add_argument("-x", "--betamatrix", dest="istmBetaMatrix", metavar="BetaDiversityMatrix",
-args = argp.add_argument_group( "File formatting", "Rarely modified file formatting options" )
+default=None, help=ConstantsMicropita.c_strCustomBetaDiversityMatrixHelp)
-args.add_argument("-j",ConstantsMicropita.c_strFileDelimiterArgument, dest="cFileDelimiter", metavar="column_delimiter", default="\t", help=ConstantsMicropita.c_strFileDelimiterHelp)
+args.add_argument("-o", "--tree", dest="istrmTree", metavar="PhylogeneticTree",
-args.add_argument("-k",ConstantsMicropita.c_strFeatureNameDelimiterArgument, dest="cFeatureNameDelimiter", metavar="taxonomy_delimiter", default="|", help=ConstantsMicropita.c_strFeatureNameDelimiterHelp)
+default=None, help=ConstantsMicropita.c_strCustomPhylogeneticTreeHelp)
+args.add_argument("-i", "--envr", dest="istrmEnvr", metavar="EnvironmentFile",
-args = argp.add_argument_group( "Debugging", "Debugging options - modify at your own risk!" )
+default=None, help=ConstantsMicropita.c_strCustomEnvironmentFileHelp)
-args.add_argument("-v",ConstantsMicropita.c_strLoggingArgument, dest="strLogLevel", metavar = "log_level", default="WARNING",
+args.add_argument("-f", "--invertDiversity", dest="fInvertDiversity", action="store_true",
-				  choices=ConstantsMicropita.c_lsLoggingChoices, help= ConstantsMicropita.c_strLoggingHelp)
+default=False, help=ConstantsMicropita.c_strInvertDiversityHelp)
-args.add_argument("-c",ConstantsMicropita.c_strCheckedAbundanceFileArgument, dest="ostmCheckedFile", metavar = "output_qc", type = argparse.FileType("w"), help = ConstantsMicropita.c_strCheckedAbundanceFileHelp)
-args.add_argument("-g",ConstantsMicropita.c_strLoggingFileArgument, dest="ostmLoggingFile", metavar = "output_log", type = argparse.FileType("w"), help = ConstantsMicropita.c_strLoggingFileHelp)
+args = argp.add_argument_group(
-args.add_argument("-u",ConstantsMicropita.c_strSupervisedInputFile, dest="ostmInputPredictFile", metavar = "output_scaled", type = argparse.FileType("w"), help = ConstantsMicropita.c_strSupervisedInputFileHelp)
+"Miscellaneous", "Row/column identifiers and feature targeting options")
-args.add_argument("-p",ConstantsMicropita.c_strSupervisedPredictedFile, dest="ostmPredictFile", metavar = "output_labels", type = argparse.FileType("w"), help = ConstantsMicropita.c_strSupervisedPredictedFileHelp)
+args.add_argument("-d", ConstantsMicropita.c_strIDNameArgument, dest="strIDName",
+metavar="sample_id", help=ConstantsMicropita.c_strIDNameHelp)
-argp.add_argument("istmInput", metavar = "input.pcl/biome", type = argparse.FileType("rU"), help = ConstantsMicropita.c_strAbundanceFileHelp,
+args.add_argument("-l", ConstantsMicropita.c_strLastMetadataNameArgument, dest="strLastMetadataName", metavar="metadata_id", default=None,
-	default = sys.stdin)
+help=ConstantsMicropita.c_strLastMetadataNameHelp)
-argp.add_argument("ostmOutput", metavar = "output.txt", type = argparse.FileType("w"), help = ConstantsMicropita.c_strGenericOutputDataFileHelp,
+args.add_argument("-r", ConstantsMicropita.c_strTargetedFeatureMethodArgument, dest="strFeatureSelection", metavar="targeting_method", default=ConstantsMicropita.lsTargetedFeatureMethodValues[0],
-	default = sys.stdout)
+choices=ConstantsMicropita.lsTargetedFeatureMethodValues, help=ConstantsMicropita.c_strTargetedFeatureMethodHelp)
+args.add_argument("-t", ConstantsMicropita.c_strTargetedSelectionFileArgument, dest="istmFeatures",
-__doc__ = "::\n\n\t" + argp.format_help( ).replace( "\n", "\n\t" ) + __doc__
+metavar="feature_file", type=argparse.FileType("rU"), help=ConstantsMicropita.c_strTargetedSelectionFileHelp)
+args.add_argument("-w", ConstantsMicropita.c_strFeatureMetadataArgument, dest="strLastFeatureMetadata",
-def _main( ):
+metavar="Last_Feature_Metadata", default=None, help=ConstantsMicropita.c_strFeatureMetadataHelp)
-	args = argp.parse_args( )
+args = argp.add_argument_group(
-	#Set up logger
+"Data labeling", "Metadata IDs for strata and supervised label values")
-	iLogLevel = getattr(logging, args.strLogLevel.upper(), None)
+args.add_argument("-e", ConstantsMicropita.c_strSupervisedLabelArgument, dest="strLabel",
-	logging.basicConfig(stream = args.ostmLoggingFile if args.ostmLoggingFile else sys.stderr, filemode = 'w', level=iLogLevel)
+metavar="supervised_id", help=ConstantsMicropita.c_strSupervisedLabelHelp)
+args.add_argument("-s", ConstantsMicropita.c_strUnsupervisedStratifyMetadataArgument, dest="strUnsupervisedStratify", metavar="stratify_id",
-	#Run micropita
+help=ConstantsMicropita.c_strUnsupervisedStratifyMetadataHelp)
-	logging.info("MicroPITA:: Start microPITA")
-	microPITA = MicroPITA()
+args = argp.add_argument_group(
+"File formatting", "Rarely modified file formatting options")
-	#Argparse will append to the default but will not remove the default so I do this here
+args.add_argument("-j", ConstantsMicropita.c_strFileDelimiterArgument, dest="cFileDelimiter",
-	if not len(args.lstrMethods):
+metavar="column_delimiter", default="\t", help=ConstantsMicropita.c_strFileDelimiterHelp)
-		args.lstrMethods = [ConstantsMicropita.c_strRepresentative]
+args.add_argument("-k", ConstantsMicropita.c_strFeatureNameDelimiterArgument, dest="cFeatureNameDelimiter",
+metavar="taxonomy_delimiter", default="|", help=ConstantsMicropita.c_strFeatureNameDelimiterHelp)
-	dictSelectedSamples = microPITA.funcRun(
-		strIDName		= args.strIDName,
+args = argp.add_argument_group(
-		strLastMetadataName	= args.strLastMetadataName,
+"Debugging", "Debugging options - modify at your own risk!")
-		istmInput		= args.istmInput,
+args.add_argument("-v", ConstantsMicropita.c_strLoggingArgument, dest="strLogLevel", metavar="log_level", default="WARNING",
-		ostmInputPredictFile	= args.ostmInputPredictFile,
+choices=ConstantsMicropita.c_lsLoggingChoices, help=ConstantsMicropita.c_strLoggingHelp)
-		ostmPredictFile		= args.ostmPredictFile,
+args.add_argument("-c", ConstantsMicropita.c_strCheckedAbundanceFileArgument, dest="ostmCheckedFile",
-		ostmCheckedFile		= args.ostmCheckedFile,
+metavar="output_qc", type=argparse.FileType("w"), help=ConstantsMicropita.c_strCheckedAbundanceFileHelp)
-		ostmOutput		= args.ostmOutput,
+args.add_argument("-g", ConstantsMicropita.c_strLoggingFileArgument, dest="ostmLoggingFile",
-		cDelimiter		= args.cFileDelimiter,
+metavar="output_log", type=argparse.FileType("w"), help=ConstantsMicropita.c_strLoggingFileHelp)
-		cFeatureNameDelimiter	= args.cFeatureNameDelimiter,
+args.add_argument("-u", ConstantsMicropita.c_strSupervisedInputFile, dest="ostmInputPredictFile",
-		istmFeatures		= args.istmFeatures,
+metavar="output_scaled", type=argparse.FileType("w"), help=ConstantsMicropita.c_strSupervisedInputFileHelp)
-		strFeatureSelection	= args.strFeatureSelection,
+args.add_argument("-p", ConstantsMicropita.c_strSupervisedPredictedFile, dest="ostmPredictFile",
-		iCount			= args.iCount,
+metavar="output_labels", type=argparse.FileType("w"), help=ConstantsMicropita.c_strSupervisedPredictedFileHelp)
-		strLastRowMetadata	= args.strLastFeatureMetadata,
-		strLabel		= args.strLabel,
+argp.add_argument("istmInput", metavar="input.pcl/biome", type=argparse.FileType("rU"), help=ConstantsMicropita.c_strAbundanceFileHelp,
-		strStratify		= args.strUnsupervisedStratify,
+default=sys.stdin)
-		strCustomAlpha		= args.strAlphaDiversity,
+argp.add_argument("ostmOutput", metavar="output.txt", type=argparse.FileType("w"), help=ConstantsMicropita.c_strGenericOutputDataFileHelp,
-		strCustomBeta		= args.strBetaDiversity,
+default=sys.stdout)
-		strAlphaMetadata	= args.strAlphaMetadata,
-		istmBetaMatrix		= args.istmBetaMatrix,
+__doc__ = "::\n\n\t" + argp.format_help().replace("\n", "\n\t") + __doc__
-		istrmTree		= args.istrmTree,
-		istrmEnvr		= args.istrmEnvr,
-		lstrMethods		= args.lstrMethods,
+def _main():
-		fInvertDiversity	= args.fInvertDiversity
+args = argp.parse_args()
-	)
+# Set up logger
-	if not dictSelectedSamples:
+iLogLevel = getattr(logging, args.strLogLevel.upper(), None)
-		logging.error("MicroPITA:: Error, did not get a result from analysis.")
+logging.basicConfig(
-		return -1
+stream=args.ostmLoggingFile if args.ostmLoggingFile else sys.stderr, filemode='w', level=iLogLevel)
-	logging.info("End microPITA")
+# Run micropita
-	#Log output for debugging
+logging.info("MicroPITA:: Start microPITA")
-	logging.debug("MicroPITA:: Returned the following samples:"+str(dictSelectedSamples))
+microPITA = MicroPITA()
-	#Write selection to file
+# Argparse will append to the default but will not remove the default so I do this here
-	microPITA.funcWriteSelectionToFile(dictSelection=dictSelectedSamples, xOutputFilePath=args.ostmOutput)
+if not len(args.lstrMethods):
+args.lstrMethods = [ConstantsMicropita.c_strRepresentative]
+dictSelectedSamples = microPITA.funcRun(
+strIDName=args.strIDName,
+strLastMetadataName=args.strLastMetadataName,
+istmInput=args.istmInput,
+ostmInputPredictFile=args.ostmInputPredictFile,
+ostmPredictFile=args.ostmPredictFile,
+ostmCheckedFile=args.ostmCheckedFile,
+ostmOutput=args.ostmOutput,
+cDelimiter=args.cFileDelimiter,
+cFeatureNameDelimiter=args.cFeatureNameDelimiter,
+istmFeatures=args.istmFeatures,
+strFeatureSelection=args.strFeatureSelection,
+iCount=args.iCount,
+strLastRowMetadata=args.strLastFeatureMetadata,
+strLabel=args.strLabel,
+strStratify=args.strUnsupervisedStratify,
+strCustomAlpha=args.strAlphaDiversity,
+strCustomBeta=args.strBetaDiversity,
+strAlphaMetadata=args.strAlphaMetadata,
+istmBetaMatrix=args.istmBetaMatrix,
+istrmTree=args.istrmTree,
+istrmEnvr=args.istrmEnvr,
+lstrMethods=args.lstrMethods,
+fInvertDiversity=args.fInvertDiversity
+)
+if not dictSelectedSamples:
+logging.error("MicroPITA:: Error, did not get a result from analysis.")
+return -1
+logging.info("End microPITA")
+# Log output for debugging
+logging.debug("MicroPITA:: Returned the following samples:" +
+str(dictSelectedSamples))
+# Write selection to file
+microPITA.funcWriteSelectionToFile(
+dictSelection=dictSelectedSamples, xOutputFilePath=args.ostmOutput)
 if __name__ == "__main__":
-	_main( )
+_main()

Mercurial > repos > george-weingart > micropita

comparison MicroPITA.py @ 28:1d09ffab87a7 draft