| 0 | 1 #!/usr/bin/env python | 
|  | 2 """ | 
|  | 3 Author: Timothy Tickle | 
|  | 4 Description: Class to Run analysis for the microPITA paper | 
|  | 5 """ | 
|  | 6 | 
|  | 7 ##################################################################################### | 
|  | 8 #Copyright (C) <2012> | 
|  | 9 # | 
|  | 10 #Permission is hereby granted, free of charge, to any person obtaining a copy of | 
|  | 11 #this software and associated documentation files (the "Software"), to deal in the | 
|  | 12 #Software without restriction, including without limitation the rights to use, copy, | 
|  | 13 #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, | 
|  | 14 #and to permit persons to whom the Software is furnished to do so, subject to | 
|  | 15 #the following conditions: | 
|  | 16 # | 
|  | 17 #The above copyright notice and this permission notice shall be included in all copies | 
|  | 18 #or substantial portions of the Software. | 
|  | 19 # | 
|  | 20 #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, | 
|  | 21 #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A | 
|  | 22 #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | 
|  | 23 #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION | 
|  | 24 #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | 
|  | 25 #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | 
|  | 26 ##################################################################################### | 
|  | 27 | 
|  | 28 __author__ = "Timothy Tickle" | 
|  | 29 __copyright__ = "Copyright 2012" | 
|  | 30 __credits__ = ["Timothy Tickle"] | 
|  | 31 __license__ = "MIT" | 
|  | 32 __maintainer__ = "Timothy Tickle" | 
|  | 33 __email__ = "ttickle@sph.harvard.edu" | 
|  | 34 __status__ = "Development" | 
|  | 35 | 
|  | 36 import sys | 
|  | 37 import argparse | 
|  | 38 from src.breadcrumbs.src.AbundanceTable import AbundanceTable | 
|  | 39 from src.breadcrumbs.src.ConstantsBreadCrumbs import ConstantsBreadCrumbs | 
|  | 40 from src.breadcrumbs.src.Metric import Metric | 
|  | 41 from src.breadcrumbs.src.KMedoids import Kmedoids | 
|  | 42 from src.breadcrumbs.src.MLPYDistanceAdaptor import MLPYDistanceAdaptor | 
|  | 43 from src.breadcrumbs.src.SVM import SVM | 
|  | 44 from src.breadcrumbs.src.UtilityMath import UtilityMath | 
|  | 45 | 
|  | 46 from src.ConstantsMicropita import ConstantsMicropita | 
|  | 47 import csv | 
|  | 48 import logging | 
|  | 49 import math | 
|  | 50 import mlpy | 
|  | 51 import numpy as np | 
|  | 52 import operator | 
|  | 53 import os | 
|  | 54 import random | 
|  | 55 import scipy.cluster.hierarchy as hcluster | 
|  | 56 import scipy.spatial.distance | 
|  | 57 from types import * | 
|  | 58 | 
|  | 59 class MicroPITA: | 
|  | 60 	""" | 
|  | 61 	Selects samples from a first tier of a multi-tiered study to be used in a second tier. | 
|  | 62 	Different methods can be used for selection. | 
|  | 63 	The expected input is an abundance table (and potentially a text file of targeted features, | 
|  | 64 	if using the targeted features option). Output is a list of samples exhibiting the | 
|  | 65 	characteristics of interest. | 
|  | 66 	""" | 
|  | 67 | 
	#Constants
	#Diversity metrics Alpha (names are taken from the breadcrumbs Metric class)
	c_strInverseSimpsonDiversity = Metric.c_strInvSimpsonDiversity
	c_strChao1Diversity = Metric.c_strChao1Diversity

	#Diversity metrics Beta
	c_strBrayCurtisDissimilarity = Metric.c_strBrayCurtisDissimilarity

	#Additive inverses of diversity metrics beta
	c_strInvBrayCurtisDissimilarity = Metric.c_strInvBrayCurtisDissimilarity

	#Technique Names
	#NOTE: this sets c_strDiversity2 on the imported ConstantsMicropita class itself,
	#not on MicroPITA; it is read again below when building dictConvertAMetricDiversity.
	ConstantsMicropita.c_strDiversity2 = ConstantsMicropita.c_strDiversity+"_C"

	#Targeted feature settings
	c_strTargetedRanked = ConstantsMicropita.c_strTargetedRanked
	c_strTargetedAbundance = ConstantsMicropita.c_strTargetedAbundance

	#Technique groupings
#	c_lsDiversityMethods = [ConstantsMicropita.c_strDiversity,ConstantsMicropita.c_strDiversity2]

	#Converts ecology metrics into standardized method selection names
	#Alpha metric name -> diversity selection method name
	dictConvertAMetricDiversity = {c_strInverseSimpsonDiversity:ConstantsMicropita.c_strDiversity, c_strChao1Diversity:ConstantsMicropita.c_strDiversity2}
#	dictConvertMicroPITAToAMetric = {ConstantsMicropita.c_strDiversity:c_strInverseSimpsonDiversity, ConstantsMicropita.c_strDiversity2:c_strChao1Diversity}
	#Beta metric name -> representative selection method name
	dictConvertBMetricToMethod = {c_strBrayCurtisDissimilarity:ConstantsMicropita.c_strRepresentative}
	#Beta metric name -> extreme selection method name
	#NOTE(review): keyed by c_strBrayCurtisDissimilarity rather than c_strInvBrayCurtisDissimilarity -- confirm this is intended.
	dictConvertInvBMetricToMethod = {c_strBrayCurtisDissimilarity:ConstantsMicropita.c_strExtreme}

	#Linkage used in the Hierarchical clustering
	#(passed as the method argument of hcluster.linkage in funcSelectExtremeSamplesFromHClust)
	c_strHierarchicalClusterMethod = 'average'
|  | 97 | 
|  | 98 ####Group 1## Diversity | 
|  | 99 	#Testing: Happy path Testing (8) | 
|  | 100 	def funcGetTopRankedSamples(self, lldMatrix = None, lsSampleNames = None, iTopAmount = None): | 
|  | 101 		""" | 
|  | 102 		Given a list of lists of measurements, for each list the indices of the highest values are returned. If lsSamplesNames is given | 
|  | 103 			it is treated as a list of string names that is in the order of the measurements in each list. Indices are returned or the sample | 
|  | 104 			names associated with the indices. | 
|  | 105 | 
|  | 106 		:param	lldMatrix:	List of lists [[value,value,value,value],[value,value,value,value]]. | 
|  | 107 		:type:	List of lists	List of measurements. Each list is a different measurement. Each measurement in positionally related to a sample. | 
|  | 108 		:param	lsSampleNames:	List of sample names positionally related (the same) to each list (Optional). | 
|  | 109 		:type:	List of strings	List of strings. | 
|  | 110 		:param	iTopAmount:	The amount of top measured samples (assumes the higher measurements are better). | 
|  | 111 		:type:	integer	Integer amount of sample names/ indices to return. | 
|  | 112 		:return	List:	List of samples to be selected. | 
|  | 113 		""" | 
|  | 114 		topRankListRet = [] | 
|  | 115 		for rowMetrics in lldMatrix: | 
|  | 116 			#Create 2 d array to hold value and index and sort | 
|  | 117 			liIndexX = [rowMetrics,range(len(rowMetrics))] | 
|  | 118 			liIndexX[1].sort(key = liIndexX[0].__getitem__,reverse = True) | 
|  | 119 | 
|  | 120 			if lsSampleNames: | 
|  | 121 				topRankListRet.append([lsSampleNames[iIndex] for iIndex in liIndexX[1][:iTopAmount]]) | 
|  | 122 			else: | 
|  | 123 				topRankListRet.append(liIndexX[1][:iTopAmount]) | 
|  | 124 | 
|  | 125 		return topRankListRet | 
|  | 126 | 
|  | 127 	####Group 2## Representative Dissimilarity | 
|  | 128 	#Testing: Happy path tested 1 | 
|  | 129 	def funcGetCentralSamplesByKMedoids(self, npaMatrix=None, sMetric=None, lsSampleNames=None, iNumberSamplesReturned=0, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None): | 
|  | 130 		""" | 
|  | 131 		Gets centroid samples by k-medoids clustering of a given matrix. | 
|  | 132 | 
|  | 133 		:param	npaMatrix:	Numpy array where row=features and columns=samples | 
|  | 134 		:type:	Numpy array	Abundance Data. | 
|  | 135 		:param	sMetric:	String name of beta metric used as the distance metric. | 
|  | 136 		:type:	String	String name of beta metric. | 
|  | 137 		:param	lsSampleNames:	The names of the sample | 
|  | 138 		:type:	List	List of strings | 
|  | 139 		:param	iNumberSamplesReturned:	Number of samples to return, each will be a centroid of a sample. | 
|  | 140 		:type:	Integer	Number of samples to return | 
|  | 141 		:return	List:	List of selected samples. | 
|  | 142 		:param	istmBetaMatrix: File with beta-diversity matrix | 
|  | 143 		:type:	File stream or file path string | 
|  | 144 		""" | 
|  | 145 | 
|  | 146 		#Count of how many rows | 
|  | 147 		sampleCount = npaMatrix.shape[0] | 
|  | 148 		if iNumberSamplesReturned > sampleCount: | 
|  | 149 			logging.error("MicroPITA.funcGetCentralSamplesByKMedoids:: There are not enough samples to return the amount of samples specified. Return sample count = "+str(iNumberSamplesReturned)+". Sample number = "+str(sampleCount)+".") | 
|  | 150 			return False | 
|  | 151 | 
|  | 152 		#If the cluster count is equal to the sample count return all samples | 
|  | 153 		if sampleCount == iNumberSamplesReturned: | 
|  | 154 			return list(lsSampleNames) | 
|  | 155 | 
|  | 156 		#Get distance matrix | 
|  | 157 		distanceMatrix=scipy.spatial.distance.squareform(Metric.funcReadMatrixFile(istmMatrixFile=istmBetaMatrix,lsSampleOrder=lsSampleNames)[0]) if istmBetaMatrix else Metric.funcGetBetaMetric(npadAbundancies=npaMatrix, sMetric=sMetric, istrmTree=istrmTree, istrmEnvr=istrmEnvr, lsSampleOrder=lsSampleNames) | 
|  | 158 		if type(distanceMatrix) is BooleanType: | 
|  | 159 			logging.error("MicroPITA.funcGetCentralSamplesByKMedoids:: Could not read in the supplied distance matrix, returning false.") | 
|  | 160 			return False | 
|  | 161 | 
|  | 162 		# Handle unifrac output | 
|  | 163 		if sMetric in [Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted]: | 
|  | 164 			distanceMatrix = distanceMatrix[0] | 
|  | 165 | 
|  | 166 		#Log distance matrix | 
|  | 167 		logging.debug("MicroPITA.funcGetCentralSamplesByKMedoids:: Distance matrix for representative selection using metric="+str(sMetric)) | 
|  | 168 | 
|  | 169 		distance = MLPYDistanceAdaptor(npaDistanceMatrix=distanceMatrix, fIsCondensedMatrix=True) | 
|  | 170 | 
|  | 171 		#Create object to determine clusters/medoids | 
|  | 172 		medoidsMaker = Kmedoids(k=iNumberSamplesReturned, dist=distance) | 
|  | 173 		#medoidsData includes(1d numpy array, medoids indexes; | 
|  | 174 		#			  1d numpy array, non-medoids indexes; | 
|  | 175 		#			  1d numpy array, cluster membership for non-medoids; | 
|  | 176 		#			  double, cost of configuration) | 
|  | 177 		#npaMatrix is samples x rows | 
|  | 178 		#Build a matrix of lists of indicies to pass to the distance matrix | 
|  | 179 		lliIndicesMatrix = [[iIndexPosition] for iIndexPosition in xrange(0,len(npaMatrix))] | 
|  | 180 		medoidsData = medoidsMaker.compute(np.array(lliIndicesMatrix)) | 
|  | 181 		logging.debug("MicroPITA.funcGetCentralSamplesByKMedoids:: Results from the kmedoid method in representative selection:") | 
|  | 182 		logging.debug(str(medoidsData)) | 
|  | 183 | 
|  | 184 		#If returning the same amount of clusters and samples | 
|  | 185 		#Return centroids | 
|  | 186 		selectedIndexes = medoidsData[0] | 
|  | 187 		return [lsSampleNames[selectedIndexes[index]] for index in xrange(0,iNumberSamplesReturned)] | 
|  | 188 | 
|  | 189 	####Group 3## Highest Dissimilarity | 
|  | 190 	#Testing: Happy path tested | 
|  | 191 	def funcSelectExtremeSamplesFromHClust(self, strBetaMetric, npaAbundanceMatrix, lsSampleNames, iSelectSampleCount, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None): | 
|  | 192 		""" | 
|  | 193 		Select extreme samples from HClustering. | 
|  | 194 | 
|  | 195 		:param	strBetaMetric:	The beta metric to use for distance matrix generation. | 
|  | 196 		:type:	String	The name of the beta metric to use. | 
|  | 197 		:param	npaAbundanceMatrix:	Numpy array where row=samples and columns=features. | 
|  | 198 		:type:	Numpy Array	Abundance data. | 
|  | 199 		:param	lsSampleNames:	The names of the sample. | 
|  | 200 		:type:	List	List of strings. | 
|  | 201 		:param	iSelectSampleCount:	Number of samples to select (return). | 
|  | 202 		:type:	Integer	Integer number of samples returned. | 
|  | 203 		:return	Samples:	List of samples. | 
|  | 204 		:param	istmBetaMatrix: File with beta-diversity matrix | 
|  | 205 		:type:	File stream or file path string | 
|  | 206 		""" | 
|  | 207 | 
|  | 208 		#If they want all the sample count, return all sample names | 
|  | 209 		iSampleCount=len(npaAbundanceMatrix[:,0]) | 
|  | 210 		if iSelectSampleCount==iSampleCount: | 
|  | 211 		  return lsSampleNames | 
|  | 212 | 
|  | 213 		#Holds the samples to be returned | 
|  | 214 		lsReturnSamplesRet = [] | 
|  | 215 | 
|  | 216 		#Generate beta matrix | 
|  | 217 		#Returns condensed matrix | 
|  | 218 		tempDistanceMatrix = scipy.spatial.distance.squareform(Metric.funcReadMatrixFile(istmMatrixFile=istmBetaMatrix,lsSampleOrder=lsSampleNames)[0]) if istmBetaMatrix else Metric.funcGetBetaMetric(npadAbundancies=npaAbundanceMatrix, sMetric=strBetaMetric, istrmTree=istrmTree, istrmEnvr=istrmEnvr, lsSampleOrder=lsSampleNames, fAdditiveInverse = True) | 
|  | 219 | 
|  | 220 		if strBetaMetric in [Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted]: | 
|  | 221 			tempDistanceMatrix = tempDistanceMatrix[0] | 
|  | 222 | 
|  | 223 		if type(tempDistanceMatrix) is BooleanType: | 
|  | 224 			logging.error("MicroPITA.funcSelectExtremeSamplesFromHClust:: Could not read in the supplied distance matrix, returning false.") | 
|  | 225 			return False | 
|  | 226 | 
|  | 227 		if istmBetaMatrix: | 
|  | 228 			tempDistanceMatrix = 1-tempDistanceMatrix | 
|  | 229 | 
|  | 230 		#Feed beta matrix to linkage to cluster | 
|  | 231 		#Send condensed matrix | 
|  | 232 		linkageMatrix = hcluster.linkage(tempDistanceMatrix, method=self.c_strHierarchicalClusterMethod) | 
|  | 233 | 
|  | 234 		#Extract cluster information from dendrogram | 
|  | 235 		#The linakge matrix is of the form | 
|  | 236 		#[[int1 int2 doube int3],...] | 
|  | 237 		#int1 and int1 are the paired samples indexed at 0 and up. | 
|  | 238 		#each list is an entry for a branch that is number starting with the first | 
|  | 239 		#list being sample count index + 1 | 
|  | 240 		#each list is then named by an increment as they appear | 
|  | 241 		#this means that if a number is in the list and is = sample count or greater it is not | 
|  | 242 		#terminal and is instead a branch. | 
|  | 243 		#This method just takes the lowest metric measurement (highest distance pairs/clusters) | 
|  | 244 		#Works much better than the original technique | 
|  | 245 		#get total number of samples | 
|  | 246 | 
|  | 247 		iCurrentSelectCount = 0 | 
|  | 248 		for row in linkageMatrix: | 
|  | 249 			#Get nodes ofthe lowest pairing (so the furthest apart pair) | 
|  | 250 			iNode1 = int(row[0]) | 
|  | 251 			iNode2 = int(row[1]) | 
|  | 252 			#Make sure the nodes are a terminal node (sample) and not a branch in the dendrogram | 
|  | 253 			#The branching in the dendrogram will start at the number of samples and increment higher. | 
|  | 254 			#Add each of the pair one at a time breaking when enough samples are selected. | 
|  | 255 			if iNode1<iSampleCount: | 
|  | 256 				lsReturnSamplesRet.append(lsSampleNames[iNode1]) | 
|  | 257 				iCurrentSelectCount = iCurrentSelectCount + 1 | 
|  | 258 			if iCurrentSelectCount == iSelectSampleCount: | 
|  | 259 				break | 
|  | 260 			if iNode2<iSampleCount: | 
|  | 261 				lsReturnSamplesRet.append(lsSampleNames[iNode2]) | 
|  | 262 				iCurrentSelectCount = iCurrentSelectCount + 1 | 
|  | 263 			if iCurrentSelectCount == iSelectSampleCount: | 
|  | 264 				break | 
|  | 265 | 
|  | 266 		#Return selected samples | 
|  | 267 		return lsReturnSamplesRet | 
|  | 268 | 
|  | 269 	####Group 4## Rank Average of user Defined Taxa | 
|  | 270 		#Testing: Happy Path Tested | 
|  | 271 	def funcGetAverageAbundanceSamples(self, abndTable, lsTargetedFeature, fRank=False): | 
|  | 272 		""" | 
|  | 273 		Averages feature abundance or ranked abundance. Expects a column 0 of taxa id that is skipped. | 
|  | 274 | 
|  | 275 		:param	abndTable:	Abundance Table to analyse | 
|  | 276 		:type:	AbundanceTable	Abundance Table | 
|  | 277 		:param	lsTargetedFeature:	String names | 
|  | 278 		:type:	list	list of string names of features (bugs) which are measured after ranking against the full sample | 
|  | 279 		:param  fRank:	Indicates to rank the abundance before getting the average abundance of the features (default false) | 
|  | 280 		:type:   boolean	Flag indicating ranking abundance before calculating average feature measurement (false= no ranking) | 
|  | 281 		:return	List of lists or boolean:	List of lists or False on error. One internal list per sample indicating the sample, | 
|  | 282 				feature average abundance or ranked abundance. Lists will already be sorted. | 
|  | 283 				For not Ranked [[sample,average abundance of selected feature,1]] | 
|  | 284 				For Ranked [[sample,average ranked abundance, average abundance of selected feature]] | 
|  | 285 				Error Returns false | 
|  | 286 		""" | 
|  | 287 | 
|  | 288 		llAbundance = abndTable.funcGetAverageAbundancePerSample(lsTargetedFeature) | 
|  | 289 		if not llAbundance: | 
|  | 290 			logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not get average abundance, returned false. Make sure the features (bugs) are spelled correctly and in the abundance table.") | 
|  | 291 			return False | 
|  | 292 		#Add a space for ranking if needed | 
|  | 293 		#Not ranked will be [[sSample,average abundance,1]] | 
|  | 294 		#(where 1 will not discriminant ties if used in later functions, so this generalizes) | 
|  | 295 		#Ranked will be [[sSample, average rank, average abundance]] | 
|  | 296 		llRetAbundance = [[llist[0],-1,llist[1]] for llist in llAbundance] | 
|  | 297 		#Rank if needed | 
|  | 298 		if fRank: | 
|  | 299 			abndRanked = abndTable.funcRankAbundance() | 
|  | 300 			if abndRanked == None: | 
|  | 301 				logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not rank the abundance table, returned false.") | 
|  | 302 				return False | 
|  | 303 			llRetRank = abndRanked.funcGetAverageAbundancePerSample(lsTargetedFeature) | 
|  | 304 			if not llRetRank: | 
|  | 305 				logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not get average ranked abundance, returned false. Make sure the features (bugs) are spelled correctly and in the abundance table.") | 
|  | 306 				return False | 
|  | 307 			dictRanks = dict(llRetRank) | 
|  | 308 			llRetAbundance = [[a[0],dictRanks[a[0]],a[2]] for a in llRetAbundance] | 
|  | 309 | 
|  | 310 		#Sort first for ties and then for the main feature | 
|  | 311  		if not fRank or ConstantsMicropita.c_fBreakRankTiesByDiversity: | 
|  | 312 			llRetAbundance = sorted(llRetAbundance, key = lambda sampleData: sampleData[2], reverse = not fRank) | 
|  | 313 		if fRank: | 
|  | 314 			llRetAbundance = sorted(llRetAbundance, key = lambda sampleData: sampleData[1], reverse = not fRank) | 
|  | 315 		return llRetAbundance | 
|  | 316 | 
|  | 317 	#Testing: Happy Path Tested | 
|  | 318 	def funcSelectTargetedTaxaSamples(self, abndMatrix, lsTargetedTaxa, iSampleSelectionCount, sMethod = ConstantsMicropita.lsTargetedFeatureMethodValues[0]): | 
|  | 319 	  """ | 
|  | 320 	  Selects samples with the highest ranks or abundance of targeted features. | 
|  | 321 	  If ranked, select the highest abundance for tie breaking | 
|  | 322 | 
|  | 323 	  :param	abndMatrix:	Abundance table to analyse | 
|  | 324 	  :type:	AbundanceTable	Abundance table | 
|  | 325 	  :param	lsTargetedTaxa:	List of features | 
|  | 326 	  :type:	list	list of strings | 
|  | 327 	  :param	iSampleSelectionCount:	Number of samples to select | 
|  | 328 	  :type:	integer	integer | 
|  | 329 	  :param	sMethod:	Method to select targeted features | 
|  | 330 	  :type:	string	String (Can be values found in ConstantsMicropita.lsTargetedFeatureMethodValues) | 
|  | 331 	  :return	List of strings:	List of sample names which were selected | 
|  | 332 	  List of strings	Empty list is returned on an error. | 
|  | 333 	  """ | 
|  | 334 | 
|  | 335 	  #Check data | 
|  | 336 	  if(len(lsTargetedTaxa) < 1): | 
|  | 337 		logging.error("MicroPITA.funcSelectTargetedTaxaSamples. Taxa defined selection was requested but no features were given.") | 
|  | 338 		return [] | 
|  | 339 | 
|  | 340 	  lsTargetedSamples = self.funcGetAverageAbundanceSamples(abndTable=abndMatrix, lsTargetedFeature=lsTargetedTaxa, | 
|  | 341 	  	fRank=sMethod.lower() == self.c_strTargetedRanked.lower()) | 
|  | 342 	  #If an error occured or the key word for the method was not recognized | 
|  | 343 	  if lsTargetedSamples == False: | 
|  | 344 		  logging.error("MicroPITA.funcSelectTargetedTaxaSamples:: Was not able to select for the features given. So targeted feature selection was performed. Check to make sure the features are spelled correctly and exist in the abundance file.") | 
|  | 345 		  return [] | 
|  | 346 | 
|  | 347 	  #Select from results | 
|  | 348 	  return [sSample[0] for sSample in lsTargetedSamples[:iSampleSelectionCount]] | 
|  | 349 | 
|  | 350 	####Group 5## Random | 
|  | 351 	#Testing: Happy path Tested | 
|  | 352 	def funcGetRandomSamples(self, lsSamples=None, iNumberOfSamplesToReturn=0): | 
|  | 353 		""" | 
|  | 354 		Returns random sample names of the number given. No replacement. | 
|  | 355 | 
|  | 356 		:param	lsSamples:	List of sample names | 
|  | 357 		:type:	list	list of strings | 
|  | 358 		:param	iNumberOfSamplesToReturn:	Number of samples to select | 
|  | 359 		:type:	integer	integer. | 
|  | 360 		:return	List:	List of selected samples (strings). | 
|  | 361 		""" | 
|  | 362 | 
|  | 363 		#Input matrix sample count | 
|  | 364 		sampleCount = len(lsSamples) | 
|  | 365 | 
|  | 366 		#Return the full matrix if they ask for a return matrix where length == original | 
|  | 367 		if(iNumberOfSamplesToReturn >= sampleCount): | 
|  | 368 			return lsSamples | 
|  | 369 | 
|  | 370 		#Get the random indices for the sample (without replacement) | 
|  | 371 		liRandomIndices = random.sample(range(sampleCount), iNumberOfSamplesToReturn) | 
|  | 372 | 
|  | 373 		#Create a boolean array of if indexes are to be included in the reduced array | 
|  | 374                 return [sSample for iIndex, sSample in enumerate(lsSamples) if iIndex in liRandomIndices] | 
|  | 375 | 
|  | 376 	#Happy path tested (case 3) | 
|  | 377 	def funcGetAveragePopulation(self, abndTable, lfCompress): | 
|  | 378 		""" | 
|  | 379 		Get the average row per column in the abndtable. | 
|  | 380 | 
|  | 381 		:param abndTable: AbundanceTable of data to be averaged | 
|  | 382 		:type: AbudanceTable | 
|  | 383 		:param lfCompress: List of boolean flags (false means to remove sample before averaging | 
|  | 384 		:type: List of floats | 
|  | 385 		:return List of doubles: | 
|  | 386 		""" | 
|  | 387 		if sum(lfCompress) == 0: | 
|  | 388 			return [] | 
|  | 389 | 
|  | 390 		#Get the average populations | 
|  | 391 		lAverageRet = [] | 
|  | 392 | 
|  | 393 		for sFeature in abndTable.funcGetAbundanceCopy(): | 
|  | 394 			sFeature = list(sFeature)[1:] | 
|  | 395 			sFeature=np.compress(lfCompress,sFeature,axis=0) | 
|  | 396 			lAverageRet.append(sum(sFeature)/float(len(sFeature))) | 
|  | 397 		return lAverageRet | 
|  | 398 | 
|  | 399 	#Happy path tested (2 cases) | 
|  | 400 	def funcGetDistanceFromAverage(self, abndTable,ldAverage,lsSamples,lfSelected): | 
|  | 401 		""" | 
|  | 402 		Given an abundance table and an average sample, this returns the distance of each sample | 
|  | 403 		(measured using brays-curtis dissimilarity) from the average. | 
|  | 404 		The distances are reduced by needing to be in the lsSamples and being a true in the lfSelected | 
|  | 405 		(which is associated with the samples in the order of the samples in the abundance table; | 
|  | 406 		use abundancetable.funcGetSampleNames() to see the order if needed). | 
|  | 407 | 
|  | 408 		:param abndTable: Abundance table holding the data to be analyzed. | 
|  | 409 		:type: AbundanceTable | 
|  | 410 		:param ldAverage: Average population (Average features of the abundance table of samples) | 
|  | 411 		:type: List of doubles which represent the average population | 
|  | 412 		:param lsSamples: These are the only samples used in the analysis | 
|  | 413 		:type: List of strings (sample ids) | 
|  | 414 		:param lfSelected: Samples to be included in the analysis | 
|  | 415 		:type: List of boolean (true means include) | 
|  | 416 		:return: List of distances (doubles) | 
|  | 417 		""" | 
|  | 418 		#Get the distance from label 1 of all samples in label0 splitting into selected and not selected lists | 
|  | 419 		ldSelectedDistances = [] | 
|  | 420 | 
|  | 421 		for sSampleName in [sSample for iindex, sSample in enumerate(lsSamples) if lfSelected[iindex]]: | 
|  | 422 			#Get the sample measurements | 
|  | 423 			ldSelectedDistances.append(Metric.funcGetBrayCurtisDissimilarity(np.array([abndTable.funcGetSample(sSampleName),ldAverage]))[0]) | 
|  | 424 		return ldSelectedDistances | 
|  | 425 | 
|  | 426 	#Happy path tested (1 case) | 
|  | 427 	def funcMeasureDistanceFromLabelToAverageOtherLabel(self, abndTable, lfGroupOfInterest, lfGroupOther): | 
|  | 428 		""" | 
|  | 429 		Get the distance of samples from one label from the average sample of not the label. | 
|  | 430 		Note: This assumes 2 classes. | 
|  | 431 | 
|  | 432 		:param abndTable: Table of data to work out of. | 
|  | 433 		:type: Abundace Table | 
|  | 434 		:param lfGroupOfInterest: Boolean indicator of the sample being in the first group. | 
|  | 435 		:type: List of floats, true indicating an individual in the group of interest. | 
|  | 436 		:param lfGroupOther:	Boolean indicator of the sample being in the other group. | 
|  | 437 		:type:	List of floats, true indicating an individual in the | 
|  | 438 		:return List of List of doubles: [list of tuples (string sample name,double distance) for the selected population, list of tuples for the not selected population] | 
|  | 439 		""" | 
|  | 440 		#Get all sample names | 
|  | 441 		lsAllSamples = abndTable.funcGetSampleNames() | 
|  | 442 | 
|  | 443 		#Get average populations | 
|  | 444 		lAverageOther = self.funcGetAveragePopulation(abndTable=abndTable, lfCompress=lfGroupOther) | 
|  | 445 | 
|  | 446 		#Get the distance from the average of the other label (label 1) | 
|  | 447 		ldSelectedDistances = self.funcGetDistanceFromAverage(abndTable=abndTable, ldAverage=lAverageOther, | 
|  | 448 			lsSamples=lsAllSamples, lfSelected=lfGroupOfInterest) | 
|  | 449 | 
|  | 450 		return zip([lsAllSamples[iindex] for iindex, fGroup in enumerate(lfGroupOfInterest) if fGroup],ldSelectedDistances) | 
|  | 451 | 
|  | 452 	#Happy path tested (1 test case) | 
|  | 453 	def funcPerformDistanceSelection(self, abndTable, iSelectionCount, sLabel, sValueOfInterest): | 
|  | 454 		""" | 
|  | 455 		Given metadata, metadata of one value (sValueOfInterest) is measured from the average (centroid) value of another label group. | 
|  | 456 		An iSelectionCount of samples is selected from the group of interest closest to and furthest from the centroid of the other group. | 
|  | 457 | 
|  | 458 		:params  abndTable: Abundance of measurements | 
|  | 459 		:type: AbundanceTable | 
|  | 460 		:params iSelectionCount: The number of samples selected per sample. | 
|  | 461 		:type: Integer Integer greater than 0 | 
|  | 462 		:params sLabel: ID of the metadata which is the supervised label | 
|  | 463 		:type: String | 
|  | 464 		:params sValueOfInterest: Metadata value in the sLabel metadta row of the abundance table which defines the group of interest. | 
|  | 465 		:type: String found in the abundance table metadata row indicated by sLabel. | 
|  | 466 		:return list list of tuples (samplename, distance) [[iSelectionCount of tuples closest to the other centroid], [iSelectionCount of tuples farthest from the other centroid], [all tuples of samples not selected]] | 
|  | 467 		""" | 
|  | 468 | 
|  | 469 		lsMetadata = abndTable.funcGetMetadata(sLabel) | 
|  | 470 		#Other metadata values | 
|  | 471 		lsUniqueOtherValues = list(set(lsMetadata)-set(sValueOfInterest)) | 
|  | 472 | 
|  | 473 		#Get boolean indicator of values of interest | 
|  | 474 		lfLabelsInterested = [sValueOfInterest == sValue for sValue in lsMetadata] | 
|  | 475 | 
|  | 476                 #Get the distances of the items of interest from the other metadata values | 
|  | 477 		dictDistanceAverages = {} | 
|  | 478                 for sOtherLabel in lsUniqueOtherValues: | 
|  | 479 			#Get boolean indicator of labels not of interest | 
|  | 480 			lfLabelsOther = [sOtherLabel == sValue for sValue in lsMetadata] | 
|  | 481 | 
|  | 482 			#Get the distances of data from two different groups to the average of the other | 
|  | 483 			ldValueDistances = dict(self.funcMeasureDistanceFromLabelToAverageOtherLabel(abndTable, lfLabelsInterested, lfLabelsOther)) | 
|  | 484 | 
|  | 485 			for sKey in ldValueDistances: | 
|  | 486 				dictDistanceAverages[sKey] = ldValueDistances[sKey] + dictDistanceAverages[sKey] if sKey in dictDistanceAverages else ldValueDistances[sKey] | 
|  | 487 | 
|  | 488 		#Finish average by dividing by length of lsUniqueOtherValues | 
|  | 489 		ltpleAverageDistances = [(sKey, dictDistanceAverages[sKey]/float(len(lsUniqueOtherValues))) for sKey in dictDistanceAverages] | 
|  | 490 | 
|  | 491                 #Sort to extract extremes | 
|  | 492                 ltpleAverageDistances = sorted(ltpleAverageDistances,key=operator.itemgetter(1)) | 
|  | 493 | 
|  | 494 		#Get the closest and farthest distances | 
|  | 495 		ltupleDiscriminantSamples = ltpleAverageDistances[:iSelectionCount] | 
|  | 496 		ltupleDistinctSamples = ltpleAverageDistances[iSelectionCount*-1:] | 
|  | 497 | 
|  | 498 		#Remove the selected samples from the larger population of distances (better visualization) | 
|  | 499 		ldSelected = [tpleSelected[0] for tpleSelected in ltupleDiscriminantSamples+ltupleDistinctSamples] | 
|  | 500 | 
|  | 501 		#Return discriminant tuples, distinct tuples, other tuples | 
|  | 502 		return [ltupleDiscriminantSamples, ltupleDistinctSamples, | 
|  | 503 			   [tplData for tplData in ltpleAverageDistances if tplData[0] not in ldSelected]] | 
|  | 504 | 
|  | 505 	#Run the supervised method surrounding distance from centroids | 
|  | 506 	#Happy path tested (3 test cases) | 
|  | 507 	def funcRunSupervisedDistancesFromCentroids(self, abundanceTable, fRunDistinct, fRunDiscriminant, | 
|  | 508 						xOutputSupFile, xPredictSupFile, strSupervisedMetadata, | 
|  | 509 						iSampleSupSelectionCount, lsOriginalSampleNames, lsOriginalLabels, fAppendFiles = False): | 
|  | 510 		""" | 
|  | 511 		Runs supervised methods based on measuring distances of one label from the centroid of another. NAs are evaluated as theirown group. | 
|  | 512 | 
|  | 513 		:param	abundanceTable:	AbundanceTable | 
|  | 514 		:type:	AbudanceTable	Data to analyze | 
|  | 515 		:param	fRunDistinct:	Run distinct selection method | 
|  | 516 		:type:	Boolean	boolean (true runs method) | 
|  | 517 		:param	fRunDiscriminant:	Run discriminant method | 
|  | 518 		:type:	Boolean	boolean (true runs method) | 
|  | 519 		:param	xOutputSupFile:	File output from supervised methods detailing data going into the method. | 
|  | 520 		:type:	String or FileStream | 
|  | 521 		:param	xPredictSupFile:	File output from supervised methods distance results from supervised methods. | 
|  | 522 		:type:	String or FileStream | 
|  | 523 		:param strSupervisedMetadata:	The metadata that will be used to group samples. | 
|  | 524 		:type:	String | 
|  | 525 		:param	iSampleSupSelectionCount:	Number of samples to select | 
|  | 526 		:type:	Integer	int sample selection count | 
|  | 527 		:param lsOriginalSampleNames:	List of the sample names, order is important and should be preserved from the abundanceTable. | 
|  | 528 		:type:	List of samples | 
|  | 529 		:param	fAppendFiles:	Indicates that output files already exist and appending is occuring. | 
|  | 530 		:type:	Boolean | 
|  | 531 		:return	Selected Samples:	A dictionary of selected samples by selection ID | 
|  | 532 		Dictionary	{"Selection Method":["SampleID","SampleID"...]} | 
|  | 533 		""" | 
|  | 534 		#Get labels and run one label against many | 
|  | 535 		lstrMetadata = abundanceTable.funcGetMetadata(strSupervisedMetadata) | 
|  | 536 		dictlltpleDistanceMeasurements = {} | 
|  | 537 		for sMetadataValue in set(lstrMetadata): | 
|  | 538 | 
|  | 539 			#For now perform the selection here for the label of interest against the other labels | 
|  | 540 			dictlltpleDistanceMeasurements.setdefault(sMetadataValue,[]).extend(self.funcPerformDistanceSelection(abndTable=abundanceTable, | 
|  | 541 				iSelectionCount=iSampleSupSelectionCount, sLabel=strSupervisedMetadata, sValueOfInterest=sMetadataValue)) | 
|  | 542 | 
|  | 543 		#Make expected output files for supervised methods | 
|  | 544 		#1. Output file which is similar to an input file for SVMs | 
|  | 545 		#2. Output file that is similar to the probabilitic output of a SVM (LibSVM) | 
|  | 546 		#Manly for making output of supervised methods (Distance from Centroid) similar | 
|  | 547 		#MicropitaVis needs some of these files | 
|  | 548 		if xOutputSupFile: | 
|  | 549 			if fAppendFiles: | 
|  | 550 				SVM.funcUpdateSVMFileWithAbundanceTable(abndAbundanceTable=abundanceTable, xOutputSVMFile=xOutputSupFile, | 
|  | 551 					lsOriginalLabels=lsOriginalLabels, lsSampleOrdering=lsOriginalSampleNames) | 
|  | 552 			else: | 
|  | 553 				SVM.funcConvertAbundanceTableToSVMFile(abndAbundanceTable=abundanceTable, xOutputSVMFile=xOutputSupFile, | 
|  | 554 					sMetadataLabel=strSupervisedMetadata, lsOriginalLabels=lsOriginalLabels, lsSampleOrdering=lsOriginalSampleNames) | 
|  | 555 | 
|  | 556 		#Will contain the samples selected to return | 
|  | 557 		#One or more of the methods may be active so this is why I am extending instead of just returning the result of each method type | 
|  | 558 		dictSelectedSamplesRet = dict() | 
|  | 559 		for sKey, ltplDistances in dictlltpleDistanceMeasurements.items(): | 
|  | 560 			if fRunDistinct: | 
|  | 561 				dictSelectedSamplesRet.setdefault(ConstantsMicropita.c_strDistinct,[]).extend([ltple[0] for ltple in ltplDistances[1]]) | 
|  | 562 			if fRunDiscriminant: | 
|  | 563 				dictSelectedSamplesRet.setdefault(ConstantsMicropita.c_strDiscriminant,[]).extend([ltple[0] for ltple in ltplDistances[0]]) | 
|  | 564 | 
|  | 565 		if xPredictSupFile: | 
|  | 566 			dictFlattenedDistances = dict() | 
|  | 567 			[dictFlattenedDistances.setdefault(sKey, []).append(tple) | 
|  | 568 				for sKey, lltple in dictlltpleDistanceMeasurements.items() | 
|  | 569 				for ltple in lltple for tple in ltple] | 
|  | 570 			if fAppendFiles: | 
|  | 571 				self._updatePredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xOutputSupFile, | 
|  | 572 					dictltpleDistanceMeasurements=dictFlattenedDistances, abundanceTable=abundanceTable, lsOriginalSampleNames=lsOriginalSampleNames) | 
|  | 573 			else: | 
|  | 574 				self._writeToPredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xOutputSupFile, | 
|  | 575 					dictltpleDistanceMeasurements=dictFlattenedDistances, abundanceTable=abundanceTable, lsOriginalSampleNames=lsOriginalSampleNames) | 
|  | 576 		return dictSelectedSamplesRet | 
|  | 577 | 
|  | 578 	#Two happy path test cases | 
|  | 579 	def _updatePredictFile(self, xPredictSupFile, xInputLabelsFile, dictltpleDistanceMeasurements, abundanceTable, lsOriginalSampleNames): | 
|  | 580 		""" | 
|  | 581 		Manages updating the predict file. | 
|  | 582 | 
|  | 583 		:param	xPredictSupFile: File that has predictions (distances) from the supervised method. | 
|  | 584 		:type:	FileStream or String file path | 
|  | 585 		:param	xInputLabelsFile: File that as input to the supervised methods. | 
|  | 586 		:type:	FileStream or String file path | 
|  | 587 		:param	dictltpleDistanceMeasurements: | 
|  | 588 		:type:	Dictionary of lists of tuples {"labelgroup":[("SampleName",dDistance)...], "labelgroup":[("SampleName",dDistance)...]} | 
|  | 589 		""" | 
|  | 590 | 
|  | 591 		if not isinstance(xPredictSupFile, str): | 
|  | 592 			xPredictSupFile.close() | 
|  | 593 			xPredictSupFile = xPredictSupFile.name | 
|  | 594 		csvr = open(xPredictSupFile,'r') | 
|  | 595 | 
|  | 596 		f = csv.reader(csvr,delimiter=ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace) | 
|  | 597 		lsHeader = f.next()[1:] | 
|  | 598 		dictlltpleRead = dict([(sHeader,[]) for sHeader in lsHeader]) | 
|  | 599 | 
|  | 600 		#Read data in | 
|  | 601 		iSampleIndex = 0 | 
|  | 602 		for sRow in f: | 
|  | 603 			sLabel = sRow[0] | 
|  | 604 			[dictlltpleRead[lsHeader[iDistanceIndex]].append((lsOriginalSampleNames[iSampleIndex],dDistance)) for iDistanceIndex, dDistance in enumerate(sRow[1:]) | 
|  | 605 				if not dDistance == ConstantsMicropita.c_sEmptyPredictFileValue] | 
|  | 606 			iSampleIndex += 1 | 
|  | 607 | 
|  | 608 		#Combine dictltpleDistanceMeasurements with new data | 
|  | 609 		#If they share a key then merge keeping parameter data | 
|  | 610 		#If they do not share the key, keep the full data | 
|  | 611 		dictNew = {} | 
|  | 612 		for sKey in dictltpleDistanceMeasurements.keys(): | 
|  | 613 			lsSamples = [tple[0] for tple in dictltpleDistanceMeasurements[sKey]] | 
|  | 614 			dictNew[sKey] = dictltpleDistanceMeasurements[sKey]+[tple for tple in dictlltpleRead[sKey] if tple[0] not in lsSamples] if sKey in dictlltpleRead.keys() else dictltpleDistanceMeasurements[sKey] | 
|  | 615                 for sKey in dictlltpleRead: | 
|  | 616 			if sKey not in dictltpleDistanceMeasurements.keys(): | 
|  | 617 				dictNew[sKey] = dictlltpleRead[sKey] | 
|  | 618 | 
|  | 619 		#Call writer | 
|  | 620 		self._writeToPredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xInputLabelsFile, | 
|  | 621 			dictltpleDistanceMeasurements=dictNew, abundanceTable=abundanceTable, | 
|  | 622 			lsOriginalSampleNames=lsOriginalSampleNames, fFromUpdate=True) | 
|  | 623 | 
|  | 624 	#2 happy path test cases | 
|  | 625         def _writeToPredictFile(self, xPredictSupFile, xInputLabelsFile, dictltpleDistanceMeasurements, abundanceTable, lsOriginalSampleNames, fFromUpdate=False): | 
|  | 626 		""" | 
|  | 627 		Write to the predict file. | 
|  | 628 | 
|  | 629 		:param	xPredictSupFile: File that has predictions (distances) from the supervised method. | 
|  | 630 		:type:	FileStream or String file path | 
|  | 631 		:param	xInputLabelsFile: File that as input to the supervised methods. | 
|  | 632 		:type:	FileStream or String file path | 
|  | 633 		:param	dictltpleDistanceMeasurements: | 
|  | 634 		:type:	Dictionary of lists of tuples {"labelgroup":[("SampleName",dDistance)...], "labelgroup":[("SampleName",dDistance)...]} | 
|  | 635 		:param	abundanceTable: An abundance table of the sample data. | 
|  | 636 		:type:	AbundanceTable | 
|  | 637 		:param	lsOriginalSampleNames: Used if the file is being updated as the sample names so that it may be passed in and consistent with other writing. | 
|  | 638 			Otherwise will use the sample names from the abundance table. | 
|  | 639 		:type:	List of strings | 
|  | 640 		:param	fFromUpdate:	Indicates if this is part of an update to the file or not. | 
|  | 641 		:type:	Boolean | 
|  | 642 		""" | 
|  | 643 | 
|  | 644 		xInputLabelsFileName = xInputLabelsFile | 
|  | 645 		if not isinstance(xInputLabelsFile,str): | 
|  | 646 			xInputLabelsFileName = xInputLabelsFile.name | 
|  | 647 		f = csv.writer(open(xPredictSupFile,"w") if isinstance(xPredictSupFile, str) else xPredictSupFile,delimiter=ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace) | 
|  | 648 | 
|  | 649 		lsAllSampleNames = abundanceTable.funcGetSampleNames() | 
|  | 650 		lsLabels = SVM.funcReadLabelsFromFile(xSVMFile=xInputLabelsFileName, lsAllSampleNames= lsOriginalSampleNames if fFromUpdate else lsAllSampleNames, | 
|  | 651 						isPredictFile=False) | 
|  | 652 		dictLabels = dict([(sSample,sLabel) for sLabel in lsLabels.keys() for sSample in lsLabels[sLabel]]) | 
|  | 653 | 
|  | 654 		#Dictionay keys will be used to order the predict file | 
|  | 655 		lsMeasurementKeys = dictltpleDistanceMeasurements.keys() | 
|  | 656 		#Make header | 
|  | 657 		f.writerow(["labels"]+lsMeasurementKeys) | 
|  | 658 | 
|  | 659 		#Reformat dictionary to make it easier to use | 
|  | 660 		for sKey in dictltpleDistanceMeasurements: | 
|  | 661 			dictltpleDistanceMeasurements[sKey] = dict([ltpl for ltpl in dictltpleDistanceMeasurements[sKey]]) | 
|  | 662 | 
|  | 663 		for sSample in lsOriginalSampleNames: | 
|  | 664 			#Make body of file | 
|  | 665 			f.writerow([dictLabels.get(sSample,ConstantsMicropita.c_sEmptyPredictFileValue)]+ | 
|  | 666 				[str(dictltpleDistanceMeasurements[sKey].get(sSample,ConstantsMicropita.c_sEmptyPredictFileValue)) | 
|  | 667 				for sKey in lsMeasurementKeys]) | 
|  | 668 | 
|  | 669 	def _funcRunNormalizeSensitiveMethods(self, abndData, iSampleSelectionCount, dictSelectedSamples, lsAlphaMetrics, lsBetaMetrics, lsInverseBetaMetrics, | 
|  | 670 												fRunDiversity, fRunRepresentative, fRunExtreme, strAlphaMetadata=None, | 
|  | 671 												istmBetaMatrix=None, istrmTree=None, istrmEnvr=None, fInvertDiversity=False): | 
|  | 672 		""" | 
|  | 673 		Manages running methods that are sensitive to normalization. This is called twice, once for the set of methods which should not be normalized and the other | 
|  | 674 		for the set that should be normalized. | 
|  | 675 | 
|  | 676 		:param	abndData:	Abundance table object holding the samples to be measured. | 
|  | 677 		:type:	AbundanceTable | 
|  | 678 		:param	iSampleSelectionCount	The number of samples to select per method. | 
|  | 679 		:type:	Integer | 
|  | 680 		:param	dictSelectedSamples	Will be added to as samples are selected {"Method:["strSelectedSampleID","strSelectedSampleID"...]}. | 
|  | 681 		:type:	Dictionary | 
|  | 682 		:param	lsAlphaMetrics:	List of alpha metrics to use on alpha metric dependent assays (like highest diversity). | 
|  | 683 		:type:	List of strings | 
|  | 684 		:param	lsBetaMetrics:	List of beta metrics to use on beta metric dependent assays (like most representative). | 
|  | 685 		:type:	List of strings | 
|  | 686 		:param	lsInverseBetaMetrics:	List of inverse beta metrics to use on inverse beta metric dependent assays (like most dissimilar). | 
|  | 687 		:type:	List of strings | 
|  | 688 		:param	fRunDiversity:	Run Diversity based methods (true indicates run). | 
|  | 689 		:type:	Boolean | 
|  | 690 		:param	fRunRepresentative:	Run Representative based methods (true indicates run). | 
|  | 691 		:type:	Boolean | 
|  | 692 		:param	fRunExtreme:	Run Extreme based methods (true indicates run). | 
|  | 693 		:type:	Boolean | 
|  | 694 		:param	istmBetaMatrix:	File that has a precalculated beta matrix | 
|  | 695 		:type:	File stream or File path string | 
|  | 696 		:return	Selected Samples:	Samples selected by methods. | 
|  | 697 				Dictionary	{"Selection Method":["SampleID","SampleID","SampleID",...]} | 
|  | 698 		""" | 
|  | 699 | 
|  | 700 		#Sample ids/names | 
|  | 701 		lsSampleNames = abndData.funcGetSampleNames() | 
|  | 702 | 
|  | 703 		#Generate alpha metrics and get most diverse | 
|  | 704 		if fRunDiversity: | 
|  | 705 | 
|  | 706 			#Get Alpha metrics matrix | 
|  | 707 			internalAlphaMatrix = None | 
|  | 708 			#Name of technique | 
|  | 709 			strMethod = [strAlphaMetadata] if strAlphaMetadata else lsAlphaMetrics | 
|  | 710 | 
|  | 711 			#If given an alpha-diversity metadata | 
|  | 712 			if strAlphaMetadata: | 
|  | 713 				internalAlphaMatrix = [[float(strNum) for strNum in abndData.funcGetMetadata(strAlphaMetadata)]] | 
|  | 714 			else: | 
|  | 715 				#Expects Observations (Taxa (row) x sample (column)) | 
|  | 716 				#Returns [[metric1-sample1, metric1-sample2, metric1-sample3],[metric1-sample1, metric1-sample2, metric1-sample3]] | 
|  | 717 				internalAlphaMatrix = Metric.funcBuildAlphaMetricsMatrix(npaSampleAbundance = abndData.funcGetAbundanceCopy() | 
|  | 718 							if not abndData.funcIsSummed() | 
|  | 719 							else abndData.funcGetFeatureAbundanceTable(abndData.funcGetTerminalNodes()).funcGetAbundanceCopy(), | 
|  | 720 							lsSampleNames = lsSampleNames, lsDiversityMetricAlpha = lsAlphaMetrics) | 
|  | 721 | 
|  | 722 			if internalAlphaMatrix: | 
|  | 723 				#Invert measurments | 
|  | 724 				if fInvertDiversity: | 
|  | 725 					lldNewDiversity = [] | 
|  | 726 					for lsLine in internalAlphaMatrix: | 
|  | 727 						lldNewDiversity.append([1/max(dValue,ConstantsMicropita.c_smallNumber) for dValue in lsLine]) | 
|  | 728 					internalAlphaMatrix = lldNewDiversity | 
|  | 729 				#Get top ranked alpha diversity by most diverse | 
|  | 730 				#Expects [[sample1,sample2,sample3...],[sample1,sample2,sample3..],...] | 
|  | 731 				#Returns [[sampleName1, sampleName2, sampleNameN],[sampleName1, sampleName2, sampleNameN]] | 
|  | 732 				mostDiverseAlphaSamplesIndexes = self.funcGetTopRankedSamples(lldMatrix=internalAlphaMatrix, lsSampleNames=lsSampleNames, iTopAmount=iSampleSelectionCount) | 
|  | 733 | 
|  | 734 				#Add to results | 
|  | 735 				for index in xrange(0,len(strMethod)): | 
|  | 736 					strSelectionMethod = self.dictConvertAMetricDiversity.get(strMethod[index],ConstantsMicropita.c_strDiversity+"="+strMethod[index]) | 
|  | 737 					dictSelectedSamples.setdefault(strSelectionMethod,[]).extend(mostDiverseAlphaSamplesIndexes[index]) | 
|  | 738 | 
|  | 739 		logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Selected Samples 1b") | 
|  | 740 		logging.info(dictSelectedSamples) | 
|  | 741 | 
|  | 742 		#Generate beta metrics and | 
|  | 743 		if fRunRepresentative or fRunExtreme: | 
|  | 744 | 
|  | 745 			#Abundance matrix transposed | 
|  | 746 			npaTransposedAbundance = UtilityMath.funcTransposeDataMatrix(abndData.funcGetAbundanceCopy(), fRemoveAdornments=True) | 
|  | 747 | 
|  | 748 			#Get center selection using clusters/tiling | 
|  | 749 			#This will be for beta metrics in normalized space | 
|  | 750 			if fRunRepresentative: | 
|  | 751 | 
|  | 752 				if istmBetaMatrix: | 
|  | 753 					#Get representative dissimilarity samples | 
|  | 754 					medoidSamples=self.funcGetCentralSamplesByKMedoids(npaMatrix=npaTransposedAbundance, sMetric=ConstantsMicropita.c_custom, lsSampleNames=lsSampleNames, iNumberSamplesReturned=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr) | 
|  | 755 | 
|  | 756 					if medoidSamples: | 
|  | 757 						dictSelectedSamples.setdefault(ConstantsMicropita.c_strRepresentative+"="+ConstantsMicropita.c_custom,[]).extend(medoidSamples) | 
|  | 758 				else: | 
|  | 759 					logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Performing representative selection on normalized data.") | 
|  | 760 					for bMetric in lsBetaMetrics: | 
|  | 761 | 
|  | 762 						#Get representative dissimilarity samples | 
|  | 763 						medoidSamples=self.funcGetCentralSamplesByKMedoids(npaMatrix=npaTransposedAbundance, sMetric=bMetric, lsSampleNames=lsSampleNames, iNumberSamplesReturned=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr) | 
|  | 764 | 
|  | 765 						if medoidSamples: | 
|  | 766 							dictSelectedSamples.setdefault(self.dictConvertBMetricToMethod.get(bMetric,ConstantsMicropita.c_strRepresentative+"="+bMetric),[]).extend(medoidSamples) | 
|  | 767 | 
|  | 768 			#Get extreme selection using clusters, tiling | 
|  | 769 			if fRunExtreme: | 
|  | 770 				logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Performing extreme selection on normalized data.") | 
|  | 771 				if istmBetaMatrix: | 
|  | 772 | 
|  | 773 					#Samples for representative dissimilarity | 
|  | 774 					#This involves inverting the distance metric, | 
|  | 775 					#Taking the dendrogram level of where the number cluster == the number of samples to select | 
|  | 776 					#Returning a repersentative sample from each cluster | 
|  | 777 					extremeSamples = self.funcSelectExtremeSamplesFromHClust(strBetaMetric=ConstantsMicropita.c_custom, npaAbundanceMatrix=npaTransposedAbundance, lsSampleNames=lsSampleNames, iSelectSampleCount=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr) | 
|  | 778 | 
|  | 779 					#Add selected samples | 
|  | 780 					if extremeSamples: | 
|  | 781 						dictSelectedSamples.setdefault(ConstantsMicropita.c_strExtreme+"="+ConstantsMicropita.c_custom,[]).extend(extremeSamples) | 
|  | 782 | 
|  | 783 				else: | 
|  | 784 					#Run KMedoids with inverse custom distance metric in normalized space | 
|  | 785 					for bMetric in lsInverseBetaMetrics: | 
|  | 786 | 
|  | 787 						#Samples for representative dissimilarity | 
|  | 788 						#This involves inverting the distance metric, | 
|  | 789 						#Taking the dendrogram level of where the number cluster == the number of samples to select | 
|  | 790 						#Returning a repersentative sample from each cluster | 
|  | 791 						extremeSamples = self.funcSelectExtremeSamplesFromHClust(strBetaMetric=bMetric, npaAbundanceMatrix=npaTransposedAbundance, lsSampleNames=lsSampleNames, iSelectSampleCount=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr) | 
|  | 792 | 
|  | 793 						#Add selected samples | 
|  | 794 						if extremeSamples: | 
|  | 795 							dictSelectedSamples.setdefault(self.dictConvertInvBMetricToMethod.get(bMetric,ConstantsMicropita.c_strExtreme+"="+bMetric),[]).extend(extremeSamples) | 
|  | 796 | 
|  | 797 		logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Selected Samples 2,3b") | 
|  | 798 		logging.info(dictSelectedSamples) | 
|  | 799 		return dictSelectedSamples | 
|  | 800 | 
|  | 801 	def funcRun(self, strIDName, strLastMetadataName, istmInput, | 
|  | 802 					  ostmInputPredictFile, ostmPredictFile, ostmCheckedFile, ostmOutput, | 
|  | 803 					  cDelimiter, cFeatureNameDelimiter, strFeatureSelection, | 
|  | 804 					  istmFeatures, iCount, lstrMethods, strLastRowMetadata = None, strLabel = None, strStratify = None, | 
|  | 805 					  strCustomAlpha = None, strCustomBeta = None, strAlphaMetadata = None, istmBetaMatrix = None, istrmTree = None, istrmEnvr = None, | 
|  | 806 					  iMinSeqs = ConstantsMicropita.c_liOccurenceFilter[0], iMinSamples = ConstantsMicropita.c_liOccurenceFilter[1], fInvertDiversity = False): | 
|  | 807 		""" | 
|  | 808 		Manages the selection of samples given different metrics. | 
|  | 809 | 
|  | 810 		:param	strIDName: Sample Id metadata row | 
|  | 811 		:type:	String | 
|  | 812 		:param	strLastMetadataName: The id of the metadata positioned last in the abundance table. | 
|  | 813 		:type:	String	String metadata id. | 
|  | 814 		:param	istmInput: File to store input data to supervised methods. | 
|  | 815 		:type:	FileStream of String file path | 
|  | 816 		:param	ostmInputPredictFile: File to store distances from supervised methods. | 
|  | 817 		:type:	FileStream or String file path | 
|  | 818 		:param	ostmCheckedFile: File to store the AbundanceTable data after it is being checked. | 
|  | 819 		:type:	FileStream or String file path | 
|  | 820 		:param	ostmOutPut: File to store sample selection by methods of interest. | 
|  | 821 		:type:	FileStream or String file path | 
|  | 822 		:param	cDelimiter: Delimiter of abundance table. | 
|  | 823 		:type:	Character Char (default TAB). | 
|  | 824 		:param	cFeatureNameDelimiter: Delimiter of the name of features (for instance if they contain consensus lineages indicating clades). | 
|  | 825 		:type:	Character (default |). | 
|  | 826 		:param	stFeatureSelectionMethod: Which method to use to select features in a targeted manner (Using average ranked abundance or average abundance). | 
|  | 827 		:type:	String (specific values indicated in ConstantsMicropita.lsTargetedFeatureMethodValues). | 
|  | 828 		:param	istmFeatures: File which holds the features of interest if using targeted feature methodology. | 
|  | 829 		:type:	FileStream or String file path | 
|  | 830 		:param	iCount:	Number of samples to select in each methods, supervised methods select this amount per label if possible. | 
|  | 831 		:type:	Integer	integer. | 
|  | 832 		:param	lstrMethods: List of strings indicating selection techniques. | 
|  | 833 		:type:	List of string method names | 
|  | 834 		:param	strLabel: The metadata used for supervised labels. | 
|  | 835 		:type:	String | 
|  | 836 		:param	strStratify: The metadata used to stratify unsupervised data. | 
|  | 837 		:type:	String | 
|  | 838 		:param	strCustomAlpha: Custom alpha diversity metric | 
|  | 839 		:type:	String | 
|  | 840 		:param	strCustomBeta: Custom beta diversity metric | 
|  | 841 		:type:	String | 
|  | 842 		:param	strAlphaMetadata: Metadata id which is a diveristy metric to use in highest diversity sampling | 
|  | 843 		:type:	String | 
|  | 844 		:param	istmBetaMatrix: File containing precalculated beta-diversity matrix for representative sampling | 
|  | 845 		:type:	FileStream or String file path | 
|  | 846 		:param	istrmTree: File containing tree for phylogentic beta-diversity analysis | 
|  | 847 		:type:	FileStream or String file path | 
|  | 848 		:param	istrmEnvr: File containing environment for phylogentic beta-diversity analysis | 
|  | 849 		:type:	FileStream or String file path | 
|  | 850 		:param	iMinSeqs: Minimum sequence in the occurence filter which filters all features not with a minimum number of sequences in each of a minimum number of samples. | 
|  | 851 		:type:	Integer | 
|  | 852 		:param	iMinSamples: Minimum sample count for the occurence filter. | 
|  | 853 		:type:	Integer | 
|  | 854 		:param	fInvertDiversity: When true will invert diversity measurements before using. | 
|  | 855 		:type:	boolean | 
|  | 856 		:return	Selected Samples:	Samples selected by methods. | 
|  | 857 				Dictionary	{"Selection Method":["SampleID","SampleID","SampleID",...]} | 
|  | 858 		""" | 
|  | 859 | 
|  | 860 		#Holds the top ranked samples from different metrics | 
|  | 861 		#dict[metric name] = [samplename,samplename...] | 
|  | 862 		selectedSamples = dict() | 
|  | 863 | 
|  | 864 		#If a target feature file is given make sure that targeted feature is in the selection methods, if not add | 
|  | 865 		if ConstantsMicropita.c_strFeature in lstrMethods: | 
|  | 866 		  if not istmFeatures: | 
|  | 867 			logging.error("MicroPITA.funcRun:: Did not receive both the Targeted feature file and the feature selection method. MicroPITA did not run.") | 
|  | 868 			return False | 
|  | 869 | 
|  | 870 		#Diversity metrics to run | 
|  | 871 		#Use custom metrics if specified | 
|  | 872                 #Custom beta metrics set to normalized only, custom alpha metrics set to count only | 
|  | 873 		diversityMetricsAlpha = [] if strCustomAlpha or strAlphaMetadata else [MicroPITA.c_strInverseSimpsonDiversity] | 
|  | 874 		diversityMetricsBeta = [] if istmBetaMatrix else [strCustomBeta] if strCustomBeta else [MicroPITA.c_strBrayCurtisDissimilarity] | 
|  | 875 #		inverseDiversityMetricsBeta = [MicroPITA.c_strInvBrayCurtisDissimilarity] | 
|  | 876 		diversityMetricsAlphaNoNormalize = [strAlphaMetadata] if strAlphaMetadata else [strCustomAlpha] if strCustomAlpha else [] | 
|  | 877 		diversityMetricsBetaNoNormalize = [] | 
|  | 878 #		inverseDiversityMetricsBetaNoNormalize = [] | 
|  | 879 | 
|  | 880 		#Targeted taxa | 
|  | 881 		userDefinedTaxa = [] | 
|  | 882 | 
|  | 883 		#Perform different flows flags | 
|  | 884 		c_RUN_MAX_DIVERSITY_1 = ConstantsMicropita.c_strDiversity in lstrMethods | 
|  | 885 		c_RUN_REPRESENTIVE_DISSIMILARITY_2 = ConstantsMicropita.c_strRepresentative in lstrMethods | 
|  | 886 		c_RUN_MAX_DISSIMILARITY_3 = ConstantsMicropita.c_strExtreme in lstrMethods | 
|  | 887 		c_RUN_RANK_AVERAGE_USER_4 = False | 
|  | 888 		if ConstantsMicropita.c_strFeature in lstrMethods: | 
|  | 889 			c_RUN_RANK_AVERAGE_USER_4 = True | 
|  | 890 			if not istmFeatures: | 
|  | 891 				logging.error("MicroPITA.funcRun:: No taxa file was given for taxa selection.") | 
|  | 892 				return False | 
|  | 893 			#Read in taxa list, break down to lines and filter out empty strings | 
|  | 894 			userDefinedTaxa = filter(None,(s.strip( ) for s in istmFeatures.readlines())) | 
|  | 895 		c_RUN_RANDOM_5 = ConstantsMicropita.c_strRandom in lstrMethods | 
|  | 896 		c_RUN_DISTINCT = ConstantsMicropita.c_strDistinct in lstrMethods | 
|  | 897 		c_RUN_DISCRIMINANT = ConstantsMicropita.c_strDiscriminant in lstrMethods | 
|  | 898 | 
|  | 899 		#Read in abundance data | 
|  | 900 		#Abundance is a structured array. Samples (column) by Taxa (rows) with the taxa id row included as the column index=0 | 
|  | 901 		#Abundance table object to read in and manage data | 
|  | 902 		totalAbundanceTable = AbundanceTable.funcMakeFromFile(xInputFile=istmInput, lOccurenceFilter = [iMinSeqs, iMinSamples], | 
|  | 903 								cDelimiter=cDelimiter, sMetadataID=strIDName, sLastMetadataRow=strLastRowMetadata, | 
|  | 904 								sLastMetadata=strLastMetadataName, cFeatureNameDelimiter=cFeatureNameDelimiter, xOutputFile=ostmCheckedFile) | 
|  | 905 		if not totalAbundanceTable: | 
|  | 906 			logging.error("MicroPITA.funcRun:: Could not read in the abundance table. Analysis was not performed."+ | 
|  | 907 				" This often occurs when the Last Metadata is not specified correctly."+ | 
|  | 908 				" Please check to make sure the Last Metadata selection is the row of the last metadata,"+ | 
|  | 909 				" all values after this selection should be microbial measurements and should be numeric.") | 
|  | 910 			return False | 
|  | 911 | 
|  | 912 		lsOriginalLabels = SVM.funcMakeLabels(totalAbundanceTable.funcGetMetadata(strLabel)) if strLabel else strLabel | 
|  | 913 | 
|  | 914 		dictTotalMetadata = totalAbundanceTable.funcGetMetadataCopy() | 
|  | 915 		logging.debug("MicroPITA.funcRun:: Received metadata=" + str(dictTotalMetadata)) | 
|  | 916 		#If there is only 1 unique value for the labels, do not run the Supervised methods | 
|  | 917 		if strLabel and ( len(set(dictTotalMetadata.get(strLabel,[]))) < 2 ): | 
|  | 918 			logging.error("The label " + strLabel + " did not have 2 or more values. Labels found=" + str(dictTotalMetadata.get(strLabel,[]))) | 
|  | 919 			return False | 
|  | 920 | 
|  | 921 		#Run unsupervised methods### | 
|  | 922 		#Stratify the data if need be and drop the old data | 
|  | 923 		lStratifiedAbundanceTables = totalAbundanceTable.funcStratifyByMetadata(strStratify) if strStratify else [totalAbundanceTable] | 
|  | 924 | 
|  | 925 		#For each stratified abundance block or for the unstratfified abundance | 
|  | 926 		#Run the unsupervised blocks | 
|  | 927 		fAppendSupFiles = False | 
|  | 928 		for stratAbundanceTable in lStratifiedAbundanceTables: | 
|  | 929 			logging.info("MicroPITA.funcRun:: Running abundance block:"+stratAbundanceTable.funcGetName()) | 
|  | 930 | 
|  | 931  			###NOT SUMMED, NOT NORMALIZED | 
|  | 932 			#Only perform if the data is not yet normalized | 
|  | 933 			if not stratAbundanceTable.funcIsNormalized( ): | 
|  | 934 				#Need to first work with unnormalized data | 
|  | 935 				if c_RUN_MAX_DIVERSITY_1 or c_RUN_REPRESENTIVE_DISSIMILARITY_2 or c_RUN_MAX_DISSIMILARITY_3: | 
|  | 936 | 
|  | 937 					self._funcRunNormalizeSensitiveMethods(abndData=stratAbundanceTable, iSampleSelectionCount=iCount, | 
|  | 938 													 dictSelectedSamples=selectedSamples, lsAlphaMetrics=diversityMetricsAlphaNoNormalize, | 
|  | 939 													 lsBetaMetrics=diversityMetricsBetaNoNormalize, | 
|  | 940 													 lsInverseBetaMetrics=diversityMetricsBetaNoNormalize, | 
|  | 941 													 fRunDiversity=c_RUN_MAX_DIVERSITY_1,fRunRepresentative=c_RUN_REPRESENTIVE_DISSIMILARITY_2, | 
|  | 942 													 fRunExtreme=c_RUN_MAX_DISSIMILARITY_3, strAlphaMetadata=strAlphaMetadata, | 
|  | 943                                                                                                          istrmTree=istrmTree, istrmEnvr=istrmEnvr, fInvertDiversity=fInvertDiversity) | 
|  | 944 | 
|  | 945 | 
|  | 946 			#Generate selection by the rank average of user defined taxa | 
|  | 947 			#Expects (Taxa (row) by Samples (column)) | 
|  | 948 			#Expects a column 0 of taxa id that is skipped | 
|  | 949 			#Returns [(sample name,average,rank)] | 
|  | 950 			#SUMMED AND NORMALIZED | 
|  | 951 			stratAbundanceTable.funcSumClades() | 
|  | 952 			#Normalize data at this point | 
|  | 953 			stratAbundanceTable.funcNormalize() | 
|  | 954 			if c_RUN_RANK_AVERAGE_USER_4: | 
|  | 955 				selectedSamples[ConstantsMicropita.c_strFeature] = self.funcSelectTargetedTaxaSamples(abndMatrix=stratAbundanceTable, | 
|  | 956 						lsTargetedTaxa=userDefinedTaxa, iSampleSelectionCount=iCount, sMethod=strFeatureSelection) | 
|  | 957 				logging.info("MicroPITA.funcRun:: Selected Samples Rank") | 
|  | 958 				logging.info(selectedSamples) | 
|  | 959 | 
|  | 960  			###SUMMED AND NORMALIZED analysis block | 
|  | 961 			#Diversity based metric will move reduce to terminal taxa as needed | 
|  | 962 			if c_RUN_MAX_DIVERSITY_1 or c_RUN_REPRESENTIVE_DISSIMILARITY_2 or c_RUN_MAX_DISSIMILARITY_3: | 
|  | 963 | 
|  | 964 				self._funcRunNormalizeSensitiveMethods(abndData=stratAbundanceTable, iSampleSelectionCount=iCount, | 
|  | 965 												 dictSelectedSamples=selectedSamples, lsAlphaMetrics=diversityMetricsAlpha, | 
|  | 966 												 lsBetaMetrics=diversityMetricsBeta, | 
|  | 967 												 lsInverseBetaMetrics=diversityMetricsBeta, | 
|  | 968 												 fRunDiversity=c_RUN_MAX_DIVERSITY_1,fRunRepresentative=c_RUN_REPRESENTIVE_DISSIMILARITY_2, | 
|  | 969 												 fRunExtreme=c_RUN_MAX_DISSIMILARITY_3, | 
|  | 970                                                                                                  istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr, fInvertDiversity=fInvertDiversity) | 
|  | 971 | 
|  | 972 			#5::Select randomly | 
|  | 973 			#Expects sampleNames = List of sample names [name, name, name...] | 
|  | 974 			if(c_RUN_RANDOM_5): | 
|  | 975 				#Select randomly from sample names | 
|  | 976 				selectedSamples[ConstantsMicropita.c_strRandom] = self.funcGetRandomSamples(lsSamples=stratAbundanceTable.funcGetSampleNames(), iNumberOfSamplesToReturn=iCount) | 
|  | 977 				logging.info("MicroPITA.funcRun:: Selected Samples Random") | 
|  | 978 				logging.info(selectedSamples) | 
|  | 979 | 
|  | 980 			#Perform supervised selection | 
|  | 981 			if c_RUN_DISTINCT or c_RUN_DISCRIMINANT: | 
|  | 982  				if strLabel: | 
|  | 983 					dictSelectionRet = self.funcRunSupervisedDistancesFromCentroids(abundanceTable=stratAbundanceTable, | 
|  | 984 								fRunDistinct=c_RUN_DISTINCT, fRunDiscriminant=c_RUN_DISCRIMINANT, | 
|  | 985 								xOutputSupFile=ostmInputPredictFile,xPredictSupFile=ostmPredictFile, | 
|  | 986 								strSupervisedMetadata=strLabel, iSampleSupSelectionCount=iCount, | 
|  | 987 								lsOriginalSampleNames = totalAbundanceTable.funcGetSampleNames(), | 
|  | 988 								lsOriginalLabels = lsOriginalLabels, | 
|  | 989 								fAppendFiles=fAppendSupFiles) | 
|  | 990 | 
|  | 991 					[selectedSamples.setdefault(sKey,[]).extend(lValue) for sKey,lValue in dictSelectionRet.items()] | 
|  | 992 | 
|  | 993 					if not fAppendSupFiles: | 
|  | 994 						fAppendSupFiles = True | 
|  | 995 					logging.info("MicroPITA.funcRun:: Selected Samples Unsupervised") | 
|  | 996 					logging.info(selectedSamples) | 
|  | 997 		return selectedSamples | 
|  | 998 | 
|  | 999 	#Testing: Happy path tested | 
|  | 1000 	@staticmethod | 
|  | 1001 	def funcWriteSelectionToFile(dictSelection,xOutputFilePath): | 
|  | 1002 		""" | 
|  | 1003 		Writes the selection of samples by method to an output file. | 
|  | 1004 | 
|  | 1005 		:param	dictSelection:	The dictionary of selections by method to be written to a file. | 
|  | 1006 		:type:	Dictionary	The dictionary of selections by method {"method":["sample selected","sample selected"...]} | 
|  | 1007 		:param	xOutputFilePath:	FileStream or String path to file inwhich the dictionary is written. | 
|  | 1008 		:type:	String	FileStream or String path to file | 
|  | 1009 		""" | 
|  | 1010 | 
|  | 1011 		if not dictSelection: | 
|  | 1012 			return | 
|  | 1013 | 
|  | 1014 		#Open file | 
|  | 1015 		f = csv.writer(open(xOutputFilePath,"w") if isinstance(xOutputFilePath, str) else xOutputFilePath, delimiter=ConstantsMicropita.c_outputFileDelim ) | 
|  | 1016 | 
|  | 1017 		#Create output content from dictionary | 
|  | 1018 		for sKey in dictSelection: | 
|  | 1019 			f.writerow([sKey]+dictSelection[sKey]) | 
|  | 1020 			logging.debug("MicroPITA.funcRun:: Selected samples output to file:"+str(dictSelection[sKey])) | 
|  | 1021 | 
|  | 1022 	#Testing: Happy Path tested | 
|  | 1023 	@staticmethod | 
|  | 1024 	def funcReadSelectionFileToDictionary(xInputFile): | 
|  | 1025 		""" | 
|  | 1026 		Reads in an output selection file from micropita and formats it into a dictionary. | 
|  | 1027 | 
|  | 1028 		:param	xInputFile:	String path to file or file stream to read and translate into a dictionary. | 
|  | 1029 									{"method":["sample selected","sample selected"...]} | 
|  | 1030 		:type:	FileStream or String Path to file | 
|  | 1031 		:return	Dictionary:	Samples selected by methods. | 
|  | 1032 					Dictionary	{"Selection Method":["SampleID","SampleID","SampleID",...]} | 
|  | 1033 		""" | 
|  | 1034 | 
|  | 1035 		#Open file | 
|  | 1036 		istmReader = csv.reader(open(xInputFile,'r') if isinstance(xInputFile, str) else xInputFile, delimiter = ConstantsMicropita.c_outputFileDelim) | 
|  | 1037 | 
|  | 1038 		#Dictionary to hold selection data | 
|  | 1039 		return dict([(lsLine[0], lsLine[1:]) for lsLine in istmReader]) | 
|  | 1040 | 
|  | 1041 #Set up arguments reader | 
argp = argparse.ArgumentParser(
	prog = "MicroPITA.py",
	description = """Selects samples from abundance tables based on various selection schemes."""
)

#Options most runs are expected to set: how many samples to pick and with which methods
args = argp.add_argument_group( "Common", "Commonly modified options" )
args.add_argument(
	ConstantsMicropita.c_strCountArgument, "--num",
	dest = "iCount",
	metavar = "samples",
	default = 10,
	type = int,
	help = ConstantsMicropita.c_strCountHelp
)
args.add_argument(
	"-m", "--method",
	dest = "lstrMethods",
	metavar = "method",
	default = [],
	help = ConstantsMicropita.c_strSelectionTechniquesHelp,
	choices = ConstantsMicropita.c_lsAllMethods,
	action = "append"
)

#Custom diversity metrics and the optional inputs that go with them
args = argp.add_argument_group( "Custom", "Selecting and inputing custom metrics" )
args.add_argument(
	"-a", "--alpha",
	dest = "strAlphaDiversity",
	metavar = "AlphaDiversity",
	default = None,
	help = ConstantsMicropita.c_strCustomAlphaDiversityHelp,
	choices = Metric.setAlphaDiversities
)
args.add_argument(
	"-b", "--beta",
	dest = "strBetaDiversity",
	metavar = "BetaDiversity",
	default = None,
	help = ConstantsMicropita.c_strCustomBetaDiversityHelp,
	choices = list(Metric.setBetaDiversities)+[Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted]
)
args.add_argument(
	"-q", "--alphameta",
	dest = "strAlphaMetadata",
	metavar = "AlphaDiversityMetadata",
	default = None,
	help = ConstantsMicropita.c_strCustomAlphaDiversityMetadataHelp
)
#No type is given for the next three options, so argparse hands their values on as plain strings
args.add_argument(
	"-x", "--betamatrix",
	dest = "istmBetaMatrix",
	metavar = "BetaDiversityMatrix",
	default = None,
	help = ConstantsMicropita.c_strCustomBetaDiversityMatrixHelp
)
args.add_argument(
	"-o", "--tree",
	dest = "istrmTree",
	metavar = "PhylogeneticTree",
	default = None,
	help = ConstantsMicropita.c_strCustomPhylogeneticTreeHelp
)
args.add_argument(
	"-i", "--envr",
	dest = "istrmEnvr",
	metavar = "EnvironmentFile",
	default = None,
	help = ConstantsMicropita.c_strCustomEnvironmentFileHelp
)
args.add_argument(
	"-f", "--invertDiversity",
	dest = "fInvertDiversity",
	action = "store_true",
	default = False,
	help = ConstantsMicropita.c_strInvertDiversityHelp
)

#Identifiers locating rows/columns in the abundance table and the targeted feature options
args = argp.add_argument_group( "Miscellaneous", "Row/column identifiers and feature targeting options" )
args.add_argument(
	"-d", ConstantsMicropita.c_strIDNameArgument,
	dest = "strIDName",
	metavar = "sample_id",
	help = ConstantsMicropita.c_strIDNameHelp
)
args.add_argument(
	"-l", ConstantsMicropita.c_strLastMetadataNameArgument,
	dest = "strLastMetadataName",
	metavar = "metadata_id",
	default = None,
	help = ConstantsMicropita.c_strLastMetadataNameHelp
)
#The first entry of the allowed targeting methods doubles as the default
args.add_argument(
	"-r", ConstantsMicropita.c_strTargetedFeatureMethodArgument,
	dest = "strFeatureSelection",
	metavar = "targeting_method",
	default = ConstantsMicropita.lsTargetedFeatureMethodValues[0],
	choices = ConstantsMicropita.lsTargetedFeatureMethodValues,
	help = ConstantsMicropita.c_strTargetedFeatureMethodHelp
)
args.add_argument(
	"-t", ConstantsMicropita.c_strTargetedSelectionFileArgument,
	dest = "istmFeatures",
	metavar = "feature_file",
	type = argparse.FileType("rU"),
	help = ConstantsMicropita.c_strTargetedSelectionFileHelp
)
args.add_argument(
	"-w", ConstantsMicropita.c_strFeatureMetadataArgument,
	dest = "strLastFeatureMetadata",
	metavar = "Last_Feature_Metadata",
	default = None,
	help = ConstantsMicropita.c_strFeatureMetadataHelp
)

#Metadata used as the supervised label and as the stratification key
args = argp.add_argument_group( "Data labeling", "Metadata IDs for strata and supervised label values" )
args.add_argument(
	"-e", ConstantsMicropita.c_strSupervisedLabelArgument,
	dest = "strLabel",
	metavar = "supervised_id",
	help = ConstantsMicropita.c_strSupervisedLabelHelp
)
args.add_argument(
	"-s", ConstantsMicropita.c_strUnsupervisedStratifyMetadataArgument,
	dest = "strUnsupervisedStratify",
	metavar = "stratify_id",
	help = ConstantsMicropita.c_strUnsupervisedStratifyMetadataHelp
)

#Delimiters of the input file: tab between columns, pipe within feature names by default
args = argp.add_argument_group( "File formatting", "Rarely modified file formatting options" )
args.add_argument(
	"-j", ConstantsMicropita.c_strFileDelimiterArgument,
	dest = "cFileDelimiter",
	metavar = "column_delimiter",
	default = "\t",
	help = ConstantsMicropita.c_strFileDelimiterHelp
)
args.add_argument(
	"-k", ConstantsMicropita.c_strFeatureNameDelimiterArgument,
	dest = "cFeatureNameDelimiter",
	metavar = "taxonomy_delimiter",
	default = "|",
	help = ConstantsMicropita.c_strFeatureNameDelimiterHelp
)

#Logging level and optional side-output files, each opened for writing by argparse
args = argp.add_argument_group( "Debugging", "Debugging options - modify at your own risk!" )
args.add_argument(
	"-v", ConstantsMicropita.c_strLoggingArgument,
	dest = "strLogLevel",
	metavar = "log_level",
	default = "WARNING",
	choices = ConstantsMicropita.c_lsLoggingChoices,
	help = ConstantsMicropita.c_strLoggingHelp
)
args.add_argument(
	"-c", ConstantsMicropita.c_strCheckedAbundanceFileArgument,
	dest = "ostmCheckedFile",
	metavar = "output_qc",
	type = argparse.FileType("w"),
	help = ConstantsMicropita.c_strCheckedAbundanceFileHelp
)
args.add_argument(
	"-g", ConstantsMicropita.c_strLoggingFileArgument,
	dest = "ostmLoggingFile",
	metavar = "output_log",
	type = argparse.FileType("w"),
	help = ConstantsMicropita.c_strLoggingFileHelp
)
args.add_argument(
	"-u", ConstantsMicropita.c_strSupervisedInputFile,
	dest = "ostmInputPredictFile",
	metavar = "output_scaled",
	type = argparse.FileType("w"),
	help = ConstantsMicropita.c_strSupervisedInputFileHelp
)
args.add_argument(
	"-p", ConstantsMicropita.c_strSupervisedPredictedFile,
	dest = "ostmPredictFile",
	metavar = "output_labels",
	type = argparse.FileType("w"),
	help = ConstantsMicropita.c_strSupervisedPredictedFileHelp
)

#Positional arguments: the input abundance table followed by the output selection file.
#NOTE(review): neither positional sets nargs, so argparse presumably treats both as required
#and the stdin/stdout defaults are never used - confirm before relying on them.
argp.add_argument(
	"istmInput",
	metavar = "input.pcl/biome",
	type = argparse.FileType("rU"),
	help = ConstantsMicropita.c_strAbundanceFileHelp,
	default = sys.stdin
)
argp.add_argument(
	"ostmOutput",
	metavar = "output.txt",
	type = argparse.FileType("w"),
	help = ConstantsMicropita.c_strGenericOutputDataFileHelp,
	default = sys.stdout
)

#Put the generated usage text, indented by one tab under a "::" marker, in front of the module docstring
__doc__ = "".join( [ "::\n\n\t", argp.format_help( ).replace( "\n", "\n\t" ), __doc__ ] )
|  | 1091 | 
def _main( ):
	"""
	Command line entry point: parses the arguments, configures logging, runs the
	selection and writes the selected samples to the output stream.

	:return	Integer or None:	-1 when the analysis gives back no selection, otherwise None.
	"""
	cmdArgs = argp.parse_args( )

	#Set up logger; records go to the requested log stream, or to stderr when none was given
	ostmLog = cmdArgs.ostmLoggingFile or sys.stderr
	iLogLevel = getattr(logging, cmdArgs.strLogLevel.upper(), None)
	logging.basicConfig(stream = ostmLog, filemode = 'w', level=iLogLevel)

	#Run micropita
	logging.info("MicroPITA:: Start microPITA")
	selector = MicroPITA()

	#The "append" action adds to a default list instead of replacing it, so the parser default
	#is left empty and the representative method is filled in here when no method was requested
	if len(cmdArgs.lstrMethods) == 0:
		cmdArgs.lstrMethods = [ConstantsMicropita.c_strRepresentative]

	#funcRun parameter name -> parsed command line value
	dictRunArguments = {
		"strIDName":			cmdArgs.strIDName,
		"strLastMetadataName":		cmdArgs.strLastMetadataName,
		"istmInput":			cmdArgs.istmInput,
		"ostmInputPredictFile":		cmdArgs.ostmInputPredictFile,
		"ostmPredictFile":		cmdArgs.ostmPredictFile,
		"ostmCheckedFile":		cmdArgs.ostmCheckedFile,
		"ostmOutput":			cmdArgs.ostmOutput,
		"cDelimiter":			cmdArgs.cFileDelimiter,
		"cFeatureNameDelimiter":	cmdArgs.cFeatureNameDelimiter,
		"istmFeatures":			cmdArgs.istmFeatures,
		"strFeatureSelection":		cmdArgs.strFeatureSelection,
		"iCount":			cmdArgs.iCount,
		"strLastRowMetadata":		cmdArgs.strLastFeatureMetadata,
		"strLabel":			cmdArgs.strLabel,
		"strStratify":			cmdArgs.strUnsupervisedStratify,
		"strCustomAlpha":		cmdArgs.strAlphaDiversity,
		"strCustomBeta":		cmdArgs.strBetaDiversity,
		"strAlphaMetadata":		cmdArgs.strAlphaMetadata,
		"istmBetaMatrix":		cmdArgs.istmBetaMatrix,
		"istrmTree":			cmdArgs.istrmTree,
		"istrmEnvr":			cmdArgs.istrmEnvr,
		"lstrMethods":			cmdArgs.lstrMethods,
		"fInvertDiversity":		cmdArgs.fInvertDiversity
	}
	dictResult = selector.funcRun(**dictRunArguments)

	if dictResult:
		logging.info("End microPITA")

		#Log output for debugging
		logging.debug("MicroPITA:: Returned the following samples:"+str(dictResult))

		#Write selection to file
		selector.funcWriteSelectionToFile(dictSelection=dictResult, xOutputFilePath=cmdArgs.ostmOutput)
		return None

	#An empty or missing result is reported as a failure
	logging.error("MicroPITA:: Error, did not get a result from analysis.")
	return -1
|  | 1143 | 
if __name__ == "__main__":
	#Hand _main's return value to the interpreter as the exit status: _main returns -1 when the
	#analysis produced no selection (non-zero exit) and None otherwise (exit status 0).
	#Previously the return value was discarded and the script always exited with status 0.
	sys.exit( _main( ) )