Mercurial > repos > george-weingart > micropita
annotate MicroPITA.py @ 20:a2e1a3c2bf9a draft
Modified the dependencies to 1.0.1
| author | george-weingart |
|---|---|
| date | Thu, 11 Aug 2016 01:36:40 -0400 |
| parents | 7d25ecd225dd |
| children | 1d09ffab87a7 |
| rev | line source |
|---|---|
| 0 | 1 #!/usr/bin/env python |
| 2 """ | |
| 3 Author: Timothy Tickle | |
| 4 Description: Class to Run analysis for the microPITA paper | |
| 5 """ | |
| 6 | |
| 7 ##################################################################################### | |
| 8 #Copyright (C) <2012> | |
| 9 # | |
| 10 #Permission is hereby granted, free of charge, to any person obtaining a copy of | |
| 11 #this software and associated documentation files (the "Software"), to deal in the | |
| 12 #Software without restriction, including without limitation the rights to use, copy, | |
| 13 #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, | |
| 14 #and to permit persons to whom the Software is furnished to do so, subject to | |
| 15 #the following conditions: | |
| 16 # | |
| 17 #The above copyright notice and this permission notice shall be included in all copies | |
| 18 #or substantial portions of the Software. | |
| 19 # | |
| 20 #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, | |
| 21 #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A | |
| 22 #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | |
| 23 #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION | |
| 24 #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | |
| 25 #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |
| 26 ##################################################################################### | |
| 27 | |
| 28 __author__ = "Timothy Tickle" | |
| 29 __copyright__ = "Copyright 2012" | |
| 30 __credits__ = ["Timothy Tickle"] | |
| 31 __license__ = "MIT" | |
| 32 __maintainer__ = "Timothy Tickle" | |
| 33 __email__ = "ttickle@sph.harvard.edu" | |
| 34 __status__ = "Development" | |
| 35 | |
| 36 import sys | |
| 37 import argparse | |
| 38 from src.breadcrumbs.src.AbundanceTable import AbundanceTable | |
|
16
7d25ecd225dd
Updated Micropita.py to suppres future warnings as this was causing a problem inn Galaxy
george.weingart@gmail.com
parents:
0
diff
changeset
|
39 import warnings |
|
7d25ecd225dd
Updated Micropita.py to suppres future warnings as this was causing a problem inn Galaxy
george.weingart@gmail.com
parents:
0
diff
changeset
|
40 warnings.simplefilter(action = "ignore", category = FutureWarning) |
| 0 | 41 from src.breadcrumbs.src.ConstantsBreadCrumbs import ConstantsBreadCrumbs |
| 42 from src.breadcrumbs.src.Metric import Metric | |
| 43 from src.breadcrumbs.src.KMedoids import Kmedoids | |
| 44 from src.breadcrumbs.src.MLPYDistanceAdaptor import MLPYDistanceAdaptor | |
| 45 from src.breadcrumbs.src.SVM import SVM | |
| 46 from src.breadcrumbs.src.UtilityMath import UtilityMath | |
| 47 | |
| 48 from src.ConstantsMicropita import ConstantsMicropita | |
| 49 import csv | |
| 50 import logging | |
| 51 import math | |
| 52 import mlpy | |
| 53 import numpy as np | |
| 54 import operator | |
| 55 import os | |
| 56 import random | |
| 57 import scipy.cluster.hierarchy as hcluster | |
| 58 import scipy.spatial.distance | |
| 59 from types import * | |
| 60 | |
| 61 class MicroPITA: | |
| 62 """ | |
| 63 Selects samples from a first tier of a multi-tiered study to be used in a second tier. | |
| 64 Different methods can be used for selection. | |
| 65 The expected input is an abundance table (and potentially a text file of targeted features, | |
| 66 if using the targeted features option). Output is a list of samples exhibiting the | |
| 67 characteristics of interest. | |
| 68 """ | |
| 69 | |
| 70 #Constants | |
| 71 #Diversity metrics Alpha | |
| 72 c_strInverseSimpsonDiversity = Metric.c_strInvSimpsonDiversity | |
| 73 c_strChao1Diversity = Metric.c_strChao1Diversity | |
| 74 | |
| 75 #Diversity metrics Beta | |
| 76 c_strBrayCurtisDissimilarity = Metric.c_strBrayCurtisDissimilarity | |
| 77 | |
| 78 #Additive inverses of diversity metrics beta | |
| 79 c_strInvBrayCurtisDissimilarity = Metric.c_strInvBrayCurtisDissimilarity | |
| 80 | |
| 81 #Technique Names | |
| 82 ConstantsMicropita.c_strDiversity2 = ConstantsMicropita.c_strDiversity+"_C" | |
| 83 | |
| 84 #Targeted feature settings | |
| 85 c_strTargetedRanked = ConstantsMicropita.c_strTargetedRanked | |
| 86 c_strTargetedAbundance = ConstantsMicropita.c_strTargetedAbundance | |
| 87 | |
| 88 #Technique groupings | |
| 89 # c_lsDiversityMethods = [ConstantsMicropita.c_strDiversity,ConstantsMicropita.c_strDiversity2] | |
| 90 | |
| 91 #Converts ecology metrics into standardized method selection names | |
| 92 dictConvertAMetricDiversity = {c_strInverseSimpsonDiversity:ConstantsMicropita.c_strDiversity, c_strChao1Diversity:ConstantsMicropita.c_strDiversity2} | |
| 93 # dictConvertMicroPITAToAMetric = {ConstantsMicropita.c_strDiversity:c_strInverseSimpsonDiversity, ConstantsMicropita.c_strDiversity2:c_strChao1Diversity} | |
| 94 dictConvertBMetricToMethod = {c_strBrayCurtisDissimilarity:ConstantsMicropita.c_strRepresentative} | |
| 95 dictConvertInvBMetricToMethod = {c_strBrayCurtisDissimilarity:ConstantsMicropita.c_strExtreme} | |
| 96 | |
| 97 #Linkage used in the Hierarchical clustering | |
| 98 c_strHierarchicalClusterMethod = 'average' | |
| 99 | |
| 100 ####Group 1## Diversity | |
| 101 #Testing: Happy path Testing (8) | |
def funcGetTopRankedSamples(self, lldMatrix = None, lsSampleNames = None, iTopAmount = None):
    """
    For each list of measurements, returns the positions (or sample names) of the highest values.

    Each inner list of lldMatrix is ranked independently, highest measurement first.
    Equal measurements keep their original relative order (the sort is stable).

    :param lldMatrix: List of lists [[value,value,value,value],[value,value,value,value]].
                      Each inner list is a different measurement; each value is positionally related to a sample.
                      None is treated as "no measurements" and yields an empty list.
    :type: List of lists
    :param lsSampleNames: List of sample names positionally related (the same order) to each inner list (Optional).
                          When given (and not empty), names are returned instead of indices.
    :type: List of strings
    :param iTopAmount: The amount of top measured samples to return per inner list (assumes higher measurements are better).
                       None returns every position, ranked.
    :type: Integer
    :return List: One list per inner list of lldMatrix holding the selected sample names or indices.
    """
    topRankListRet = []

    #Nothing to rank
    if lldMatrix is None:
        return topRankListRet

    for rowMetrics in lldMatrix:
        #Rank the positions by their measurement, highest first.
        #sorted() is used so this does not depend on range() returning a list that can be sorted in place.
        liTopIndices = sorted(range(len(rowMetrics)), key = rowMetrics.__getitem__, reverse = True)[:iTopAmount]

        if lsSampleNames:
            topRankListRet.append([lsSampleNames[iIndex] for iIndex in liTopIndices])
        else:
            topRankListRet.append(liTopIndices)

    return topRankListRet
| 128 | |
| 129 ####Group 2## Representative Dissimilarity | |
| 130 #Testing: Happy path tested 1 | |
def funcGetCentralSamplesByKMedoids(self, npaMatrix=None, sMetric=None, lsSampleNames=None, iNumberSamplesReturned=0, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None):
    """
    Gets centroid samples by k-medoids clustering of a given matrix.

    :param npaMatrix: Numpy array; the number of rows (shape[0]) is used as the sample count.
    :type: Numpy array Abundance Data.
    :param sMetric: String name of beta metric used as the distance metric.
    :type: String String name of beta metric.
    :param lsSampleNames: The names of the samples, positionally related to the rows of npaMatrix.
    :type: List List of strings
    :param iNumberSamplesReturned: Number of samples to return, each will be the medoid of a cluster.
    :type: Integer Number of samples to return
    :param istmBetaMatrix: File with beta-diversity matrix. When given it is read instead of calculating the metric.
    :type: File stream or file path string
    :param istrmTree: Passed through unchanged to Metric.funcGetBetaMetric.
    :type: Not established in this function (presumably a tree file used by the unifrac metrics, confirm against Metric).
    :param istrmEnvr: Passed through unchanged to Metric.funcGetBetaMetric.
    :type: Not established in this function (presumably an environment file used by the unifrac metrics, confirm against Metric).
    :return List: List of selected sample names. False is returned if more samples are requested
                  than exist or if the distance matrix comes back as a boolean.
    """

    #Count of how many rows
    sampleCount = npaMatrix.shape[0]
    if iNumberSamplesReturned > sampleCount:
        logging.error("MicroPITA.funcGetCentralSamplesByKMedoids:: There are not enough samples to return the amount of samples specified. Return sample count = "+str(iNumberSamplesReturned)+". Sample number = "+str(sampleCount)+".")
        return False

    #If the cluster count is equal to the sample count return all samples
    if sampleCount == iNumberSamplesReturned:
        return list(lsSampleNames)

    #Get distance matrix (read from file if one is supplied, otherwise calculate it)
    if istmBetaMatrix:
        distanceMatrix = scipy.spatial.distance.squareform(Metric.funcReadMatrixFile(istmMatrixFile=istmBetaMatrix,lsSampleOrder=lsSampleNames)[0])
    else:
        distanceMatrix = Metric.funcGetBetaMetric(npadAbundancies=npaMatrix, sMetric=sMetric, istrmTree=istrmTree, istrmEnvr=istrmEnvr, lsSampleOrder=lsSampleNames)

    #A boolean in place of a matrix signals failure
    if isinstance(distanceMatrix, bool):
        logging.error("MicroPITA.funcGetCentralSamplesByKMedoids:: Could not read in the supplied distance matrix, returning false.")
        return False

    # Handle unifrac output (the matrix is the first element of what is returned)
    if sMetric in [Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted]:
        distanceMatrix = distanceMatrix[0]

    #Log distance matrix
    logging.debug("MicroPITA.funcGetCentralSamplesByKMedoids:: Distance matrix for representative selection using metric="+str(sMetric))

    distance = MLPYDistanceAdaptor(npaDistanceMatrix=distanceMatrix, fIsCondensedMatrix=True)

    #Create object to determine clusters/medoids
    medoidsMaker = Kmedoids(k=iNumberSamplesReturned, dist=distance)
    #medoidsData includes(1d numpy array, medoids indexes;
    #              1d numpy array, non-medoids indexes;
    #              1d numpy array, cluster membership for non-medoids;
    #              double, cost of configuration)
    #The data handed to the clustering is just each sample's index wrapped in a list;
    #the distance adaptor looks the real distances up in the distance matrix.
    lliIndicesMatrix = [[iIndexPosition] for iIndexPosition in range(len(npaMatrix))]
    medoidsData = medoidsMaker.compute(np.array(lliIndicesMatrix))
    logging.debug("MicroPITA.funcGetCentralSamplesByKMedoids:: Results from the kmedoid method in representative selection:")
    logging.debug(str(medoidsData))

    #Return the sample names of the medoids
    selectedIndexes = medoidsData[0]
    return [lsSampleNames[selectedIndexes[index]] for index in range(iNumberSamplesReturned)]
| 190 | |
| 191 ####Group 3## Highest Dissimilarity | |
| 192 #Testing: Happy path tested | |
def funcSelectExtremeSamplesFromHClust(self, strBetaMetric, npaAbundanceMatrix, lsSampleNames, iSelectSampleCount, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None):
    """
    Select extreme samples from hierarchical clustering.

    :param strBetaMetric: The beta metric to use for distance matrix generation.
    :type: String The name of the beta metric to use.
    :param npaAbundanceMatrix: Numpy array where row=samples and columns=features.
    :type: Numpy Array Abundance data.
    :param lsSampleNames: The names of the sample.
    :type: List List of strings.
    :param iSelectSampleCount: Number of samples to select (return).
    :type: Integer Integer number of samples returned.
    :param istmBetaMatrix: File with beta-diversity matrix. When given it is read instead of calculating the metric.
    :type: File stream or file path string
    :param istrmTree: Passed through unchanged to Metric.funcGetBetaMetric.
    :type: Not established in this function (presumably a tree file used by the unifrac metrics, confirm against Metric).
    :param istrmEnvr: Passed through unchanged to Metric.funcGetBetaMetric.
    :type: Not established in this function (presumably an environment file used by the unifrac metrics, confirm against Metric).
    :return Samples: List of samples. lsSampleNames itself is returned when every sample is requested,
                     an empty list when no samples are requested, and False when the distance matrix
                     comes back as a boolean.
    """

    #If they want all the sample count, return all sample names
    iSampleCount=len(npaAbundanceMatrix[:,0])
    if iSelectSampleCount==iSampleCount:
        return lsSampleNames

    #Nothing was requested. The selection loop below only stops on an exact count match,
    #so a non positive count is answered here instead of being left to the loop.
    if iSelectSampleCount < 1:
        return []

    #Generate beta matrix
    #Returns condensed matrix
    if istmBetaMatrix:
        tempDistanceMatrix = scipy.spatial.distance.squareform(Metric.funcReadMatrixFile(istmMatrixFile=istmBetaMatrix,lsSampleOrder=lsSampleNames)[0])
    else:
        tempDistanceMatrix = Metric.funcGetBetaMetric(npadAbundancies=npaAbundanceMatrix, sMetric=strBetaMetric, istrmTree=istrmTree, istrmEnvr=istrmEnvr, lsSampleOrder=lsSampleNames, fAdditiveInverse = True)

    #Handle unifrac output (the matrix is the first element of what is returned).
    #A boolean failure value can not be indexed, so it is left alone for the check below.
    #NOTE(review): this unwrap is also applied when the matrix came from istmBetaMatrix, confirm that is intended.
    if strBetaMetric in [Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted] and not isinstance(tempDistanceMatrix, bool):
        tempDistanceMatrix = tempDistanceMatrix[0]

    if isinstance(tempDistanceMatrix, bool):
        logging.error("MicroPITA.funcSelectExtremeSamplesFromHClust:: Could not read in the supplied distance matrix, returning false.")
        return False

    #A matrix read from file is inverted here; a calculated matrix was requested with fAdditiveInverse above
    if istmBetaMatrix:
        tempDistanceMatrix = 1-tempDistanceMatrix

    #Feed beta matrix to linkage to cluster
    #Send condensed matrix
    linkageMatrix = hcluster.linkage(tempDistanceMatrix, method=self.c_strHierarchicalClusterMethod)

    #Extract cluster information from dendrogram
    #The linkage matrix is of the form
    #[[int1 int2 double int3],...]
    #int1 and int2 are the paired nodes indexed at 0 and up.
    #Each row is an entry for a branch; branches are numbered starting at the sample count
    #and incremented as they appear.
    #This means that if a number in a row is = sample count or greater it is not
    #terminal and is instead a branch.
    #This method just takes the lowest metric measurement (highest distance pairs/clusters)

    #Holds the samples to be returned
    lsReturnSamplesRet = []
    for row in linkageMatrix:
        #Visit each node of the pair one at a time, stopping as soon as enough samples are selected
        for iNode in (int(row[0]), int(row[1])):
            #Only terminal nodes (samples) are selected, branches of the dendrogram are skipped
            if iNode<iSampleCount:
                lsReturnSamplesRet.append(lsSampleNames[iNode])
                if len(lsReturnSamplesRet) == iSelectSampleCount:
                    return lsReturnSamplesRet

    #Return selected samples
    return lsReturnSamplesRet
| 270 | |
| 271 ####Group 4## Rank Average of user Defined Taxa | |
| 272 #Testing: Happy Path Tested | |
def funcGetAverageAbundanceSamples(self, abndTable, lsTargetedFeature, fRank=False):
    """
    Averages feature abundance or ranked abundance per sample and returns the samples sorted.

    :param abndTable: Abundance Table to analyse
    :type: AbundanceTable Abundance Table
    :param lsTargetedFeature: String names
    :type: list list of string names of features (bugs) which are measured
    :param fRank: Indicates to rank the abundance before getting the average abundance of the features (default false)
    :type: boolean Flag indicating ranking abundance before calculating average feature measurement (false= no ranking)
    :return List of lists or boolean: List of lists or False on error. One internal list per sample.
                                      For not Ranked [[sample, -1, average abundance of selected features]]
                                                     sorted by average abundance, highest first.
                                      For Ranked [[sample, average ranked abundance, average abundance of selected features]]
                                                     sorted by average ranked abundance, lowest value first.
                                      Error Returns false
    """

    llAbundance = abndTable.funcGetAverageAbundancePerSample(lsTargetedFeature)
    if not llAbundance:
        logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not get average abundance, returned false. Make sure the features (bugs) are spelled correctly and in the abundance table.")
        return False

    #Add a space for ranking if needed
    #Not ranked will be [[sSample, -1, average abundance]]
    #(the constant -1 can not discriminate between samples, so the layout generalizes)
    #Ranked will be [[sSample, average rank, average abundance]]
    llRetAbundance = [[llist[0],-1,llist[1]] for llist in llAbundance]

    #Rank if needed
    if fRank:
        abndRanked = abndTable.funcRankAbundance()
        if abndRanked is None:
            logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not rank the abundance table, returned false.")
            return False
        llRetRank = abndRanked.funcGetAverageAbundancePerSample(lsTargetedFeature)
        if not llRetRank:
            logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not get average ranked abundance, returned false. Make sure the features (bugs) are spelled correctly and in the abundance table.")
            return False
        #Place each sample's average rank next to its average abundance
        dictRanks = dict(llRetRank)
        llRetAbundance = [[a[0],dictRanks[a[0]],a[2]] for a in llRetAbundance]

    #Sort first for ties and then for the main feature (sorted is stable so the first ordering survives within ties)
    #Not ranked: average abundance, highest first.
    #Ranked with tie breaking: average abundance, lowest first.
    #NOTE(review): callers document ties as broken by the highest abundance; confirm this ascending order is intended.
    if not fRank or ConstantsMicropita.c_fBreakRankTiesByDiversity:
        llRetAbundance = sorted(llRetAbundance, key = operator.itemgetter(2), reverse = not fRank)
    #Ranked: average rank, lowest value first
    if fRank:
        llRetAbundance = sorted(llRetAbundance, key = operator.itemgetter(1))
    return llRetAbundance
| 318 | |
| 319 #Testing: Happy Path Tested | |
def funcSelectTargetedTaxaSamples(self, abndMatrix, lsTargetedTaxa, iSampleSelectionCount, sMethod = ConstantsMicropita.lsTargetedFeatureMethodValues[0]):
    """
    Selects the leading samples for the targeted features, by rank or by abundance.
    The ordering (including any tie breaking) is the one produced by funcGetAverageAbundanceSamples.

    :param abndMatrix: Abundance table to analyse
    :type: AbundanceTable Abundance table
    :param lsTargetedTaxa: List of features
    :type: list list of strings
    :param iSampleSelectionCount: Number of samples to select
    :type: integer integer
    :param sMethod: Method to select targeted features. Ranked selection is used when this matches
                    c_strTargetedRanked (case insensitive); any other value selects by abundance.
    :type: string String (Can be values found in ConstantsMicropita.lsTargetedFeatureMethodValues)
    :return List of strings: List of sample names which were selected
                             Empty list is returned on an error.
    """

    #Check data
    if lsTargetedTaxa is None or len(lsTargetedTaxa) < 1:
        logging.error("MicroPITA.funcSelectTargetedTaxaSamples. Taxa defined selection was requested but no features were given.")
        return []

    lsTargetedSamples = self.funcGetAverageAbundanceSamples(abndTable=abndMatrix, lsTargetedFeature=lsTargetedTaxa,
                                                            fRank=sMethod.lower() == self.c_strTargetedRanked.lower())
    #If an error occurred (False is the error value; identity is checked so a sequence is never compared element wise)
    if lsTargetedSamples is False:
        logging.error("MicroPITA.funcSelectTargetedTaxaSamples:: Was not able to select for the features given. So targeted feature selection was not performed. Check to make sure the features are spelled correctly and exist in the abundance file.")
        return []

    #Select from results (the first element of each entry is the sample name)
    return [sSample[0] for sSample in lsTargetedSamples[:iSampleSelectionCount]]
| 351 | |
| 352 ####Group 5## Random | |
| 353 #Testing: Happy path Tested | |
def funcGetRandomSamples(self, lsSamples=None, iNumberOfSamplesToReturn=0):
    """
    Returns random sample names of the number given. No replacement.
    The selected names are returned in the order they have in lsSamples.

    :param lsSamples: List of sample names. None is treated as no samples.
    :type: list list of strings
    :param iNumberOfSamplesToReturn: Number of samples to select
    :type: integer integer.
    :return List: List of selected samples (strings). lsSamples itself is returned when the
                  requested number is the sample count or more.
    """

    #No samples to select from
    if lsSamples is None:
        return []

    #Input sample count
    sampleCount = len(lsSamples)

    #Return the full list if they ask for as many samples as there are (or more)
    if iNumberOfSamplesToReturn >= sampleCount:
        return lsSamples

    #Get the random indices for the sample (without replacement)
    #Held in a set so that each membership test below is constant time
    setiRandomIndices = set(random.sample(range(sampleCount), iNumberOfSamplesToReturn))

    #Keep the samples whose index was drawn
    return [sSample for iIndex, sSample in enumerate(lsSamples) if iIndex in setiRandomIndices]
| 377 | |
| 378 #Happy path tested (case 3) | |
def funcGetAveragePopulation(self, abndTable, lfCompress):
    """
    Averages each row of the abundance table over the samples that are flagged to be kept.

    :param abndTable: AbundanceTable of data to be averaged
    :type: AbundanceTable
    :param lfCompress: List of boolean flags, one per sample (false means to remove the sample before averaging)
    :type: List of booleans
    :return List of doubles: One average per row of the abundance table.
                             An empty list is returned when the flags sum to 0.
    """
    #Nothing is flagged so there is nothing to average
    if sum(lfCompress) == 0:
        return []

    ldAverages = []
    for xRow in abndTable.funcGetAbundanceCopy():
        #The first entry of a row is skipped (the feature id), the rest are the sample measurements
        npaKept = np.compress(lfCompress, list(xRow)[1:], axis=0)
        ldAverages.append(sum(npaKept)/float(len(npaKept)))
    return ldAverages
| 400 | |
| 401 #Happy path tested (2 cases) | |
def funcGetDistanceFromAverage(self, abndTable,ldAverage,lsSamples,lfSelected):
    """
    Measures how far each selected sample is from an average sample.
    Each selected sample is paired with the average and handed to Metric.funcGetBrayCurtisDissimilarity;
    the first element of what is returned is kept as that sample's distance.
    A sample is selected when the flag in lfSelected at the sample's position in lsSamples is true.

    :param abndTable: Abundance table holding the data to be analyzed.
    :type: AbundanceTable
    :param ldAverage: Average population (Average features of the abundance table of samples)
    :type: List of doubles which represent the average population
    :param lsSamples: The sample ids considered for the analysis
    :type: List of strings (sample ids)
    :param lfSelected: Flags positionally related to lsSamples
    :type: List of boolean (true means include)
    :return: List of distances (doubles), one per selected sample, in the order of lsSamples
    """
    #Reduce to the names of the samples which are flagged
    lsKeptSamples = [sName for iPosition, sName in enumerate(lsSamples) if lfSelected[iPosition]]

    #Measure each kept sample against the average
    return [Metric.funcGetBrayCurtisDissimilarity(np.array([abndTable.funcGetSample(sName),ldAverage]))[0]
            for sName in lsKeptSamples]
| 427 | |
| 428 #Happy path tested (1 case) | |
def funcMeasureDistanceFromLabelToAverageOtherLabel(self, abndTable, lfGroupOfInterest, lfGroupOther):
    """
    Get the distance of samples from one label from the average sample of not the label.
    Note: This assumes 2 classes.

    :param abndTable: Table of data to work out of.
    :type: Abundance Table
    :param lfGroupOfInterest: Boolean indicator of the sample being in the first group.
    :type: List of booleans, true indicating an individual in the group of interest.
    :param lfGroupOther: Boolean indicator of the sample being in the other group.
    :type: List of booleans, true indicating an individual in the other group.
    :return List of tuples: [(string sample name, double distance),...] one tuple per sample in the group of interest.
    """
    #Get all sample names
    lsAllSamples = abndTable.funcGetSampleNames()

    #Get the average population of the other group
    lAverageOther = self.funcGetAveragePopulation(abndTable=abndTable, lfCompress=lfGroupOther)

    #Get the distance of the group of interest from the average of the other group
    ldSelectedDistances = self.funcGetDistanceFromAverage(abndTable=abndTable, ldAverage=lAverageOther,
                                                          lsSamples=lsAllSamples, lfSelected=lfGroupOfInterest)

    #Pair each sample of interest with its distance.
    #A list is built explicitly so the result can be read more than once, as a bare zip() may be a one pass iterator.
    lsSamplesOfInterest = [lsAllSamples[iindex] for iindex, fGroup in enumerate(lfGroupOfInterest) if fGroup]
    return list(zip(lsSamplesOfInterest, ldSelectedDistances))
| 453 | |
| 454 #Happy path tested (1 test case) | |
def funcPerformDistanceSelection(self, abndTable, iSelectionCount, sLabel, sValueOfInterest):
    """
    Samples holding one metadata value (sValueOfInterest) are measured from the average (centroid) of each other
    value of the same metadata, and these distances are averaged over the other values.
    An iSelectionCount of samples is selected from the group of interest closest to and furthest from the other centroids.

    :params abndTable: Abundance of measurements
    :type: AbundanceTable
    :params iSelectionCount: The number of samples selected at each extreme. A count of 0 or less selects nothing.
    :type: Integer
    :params sLabel: ID of the metadata which is the supervised label
    :type: String
    :params sValueOfInterest: Metadata value in the sLabel metadata row of the abundance table which defines the group of interest.
    :type: String found in the abundance table metadata row indicated by sLabel.
    :return list list of tuples (samplename, distance) [[iSelectionCount of tuples closest to the other centroid], [iSelectionCount of tuples farthest from the other centroid], [all tuples of samples not selected]]
    """

    lsMetadata = abndTable.funcGetMetadata(sLabel)
    #Other metadata values
    #The value of interest is removed as a whole value (a set built directly from the string would hold its characters)
    lsUniqueOtherValues = list(set(lsMetadata)-set([sValueOfInterest]))

    #Get boolean indicator of values of interest
    lfLabelsInterested = [sValueOfInterest == sValue for sValue in lsMetadata]

    #Sum the distances of the items of interest from each of the other metadata values
    dictDistanceAverages = {}
    for sOtherLabel in lsUniqueOtherValues:
        #Get boolean indicator of labels not of interest
        lfLabelsOther = [sOtherLabel == sValue for sValue in lsMetadata]

        #Get the distances of the group of interest to the average of this other group
        ldValueDistances = dict(self.funcMeasureDistanceFromLabelToAverageOtherLabel(abndTable, lfLabelsInterested, lfLabelsOther))

        for sKey in ldValueDistances:
            dictDistanceAverages[sKey] = ldValueDistances[sKey] + dictDistanceAverages[sKey] if sKey in dictDistanceAverages else ldValueDistances[sKey]

    #Finish average by dividing by length of lsUniqueOtherValues
    ltpleAverageDistances = [(sKey, dictDistanceAverages[sKey]/float(len(lsUniqueOtherValues))) for sKey in dictDistanceAverages]

    #Sort to extract extremes
    ltpleAverageDistances = sorted(ltpleAverageDistances,key=operator.itemgetter(1))

    #Get the closest and farthest distances
    #A slice starting at -0 would be the whole list, so a count of 0 (or less) is handled explicitly
    iCount = max(iSelectionCount, 0)
    ltupleDiscriminantSamples = ltpleAverageDistances[:iCount]
    ltupleDistinctSamples = ltpleAverageDistances[-iCount:] if iCount else []

    #Remove the selected samples from the larger population of distances (better visualization)
    setSelected = set(tpleSelected[0] for tpleSelected in ltupleDiscriminantSamples+ltupleDistinctSamples)

    #Return discriminant tuples, distinct tuples, other tuples
    return [ltupleDiscriminantSamples, ltupleDistinctSamples,
            [tplData for tplData in ltpleAverageDistances if tplData[0] not in setSelected]]
| 506 | |
| 507 #Run the supervised method surrounding distance from centroids | |
| 508 #Happy path tested (3 test cases) | |
def funcRunSupervisedDistancesFromCentroids(self, abundanceTable, fRunDistinct, fRunDiscriminant,
                                            xOutputSupFile, xPredictSupFile, strSupervisedMetadata,
                                            iSampleSupSelectionCount, lsOriginalSampleNames, lsOriginalLabels, fAppendFiles = False):
    """
    Runs the supervised methods which measure the distance of one label from the centroid of another.
    Every distinct metadata value is in turn treated as the group of interest.

    :param abundanceTable: AbundanceTable
    :type: AbundanceTable Data to analyze
    :param fRunDistinct: Run distinct selection method
    :type: Boolean boolean (true runs method)
    :param fRunDiscriminant: Run discriminant method
    :type: Boolean boolean (true runs method)
    :param xOutputSupFile: File output from supervised methods detailing data going into the method.
    :type: String or FileStream
    :param xPredictSupFile: File output from supervised methods distance results from supervised methods.
    :type: String or FileStream
    :param strSupervisedMetadata: The metadata that will be used to group samples.
    :type: String
    :param iSampleSupSelectionCount: Number of samples to select
    :type: Integer int sample selection count
    :param lsOriginalSampleNames: List of the sample names, order is important and should be preserved from the abundanceTable.
    :type: List of samples
    :param lsOriginalLabels: Handed unchanged to the SVM file writers.
    :type: List (presumably the original label values, confirm against the SVM class)
    :param fAppendFiles: Indicates that output files already exist and appending is occurring.
    :type: Boolean
    :return Selected Samples: A dictionary of selected samples by selection ID
                              Dictionary {"Selection Method":["SampleID","SampleID"...]}
    """
    #Run each label against the others
    #For every label: [closest tuples, farthest tuples, remaining tuples]
    dictlltpleDistanceMeasurements = {}
    for sMetadataValue in set(abundanceTable.funcGetMetadata(strSupervisedMetadata)):
        lltpleSelection = self.funcPerformDistanceSelection(abndTable=abundanceTable,
            iSelectionCount=iSampleSupSelectionCount, sLabel=strSupervisedMetadata, sValueOfInterest=sMetadataValue)
        dictlltpleDistanceMeasurements.setdefault(sMetadataValue,[]).extend(lltpleSelection)

    #Make expected output files for supervised methods
    #1. Output file which is similar to an input file for SVMs
    #2. Output file that is similar to the probabilistic output of a SVM (LibSVM)
    #Mainly for making the output of the supervised methods (Distance from Centroid) similar
    #MicropitaVis needs some of these files
    if xOutputSupFile:
        if not fAppendFiles:
            SVM.funcConvertAbundanceTableToSVMFile(abndAbundanceTable=abundanceTable, xOutputSVMFile=xOutputSupFile,
                sMetadataLabel=strSupervisedMetadata, lsOriginalLabels=lsOriginalLabels, lsSampleOrdering=lsOriginalSampleNames)
        else:
            SVM.funcUpdateSVMFileWithAbundanceTable(abndAbundanceTable=abundanceTable, xOutputSVMFile=xOutputSupFile,
                lsOriginalLabels=lsOriginalLabels, lsSampleOrdering=lsOriginalSampleNames)

    #Collect the selected sample names
    #One or more of the methods may be active, so each method accumulates across all labels
    dictSelectedSamplesRet = {}
    for lltpleLabelDistances in dictlltpleDistanceMeasurements.values():
        if fRunDistinct:
            lsDistinct = dictSelectedSamplesRet.setdefault(ConstantsMicropita.c_strDistinct,[])
            lsDistinct.extend([tpleSample[0] for tpleSample in lltpleLabelDistances[1]])
        if fRunDiscriminant:
            lsDiscriminant = dictSelectedSamplesRet.setdefault(ConstantsMicropita.c_strDiscriminant,[])
            lsDiscriminant.extend([tpleSample[0] for tpleSample in lltpleLabelDistances[0]])

    if xPredictSupFile:
        #Flatten the three lists of each label into one list of tuples per label
        #(a label without any tuple gets no entry)
        dictFlattenedDistances = {}
        for sLabelValue, lltpleGroups in dictlltpleDistanceMeasurements.items():
            for ltpleGroup in lltpleGroups:
                for tpleSample in ltpleGroup:
                    dictFlattenedDistances.setdefault(sLabelValue, []).append(tpleSample)

        if not fAppendFiles:
            self._writeToPredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xOutputSupFile,
                dictltpleDistanceMeasurements=dictFlattenedDistances, abundanceTable=abundanceTable, lsOriginalSampleNames=lsOriginalSampleNames)
        else:
            self._updatePredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xOutputSupFile,
                dictltpleDistanceMeasurements=dictFlattenedDistances, abundanceTable=abundanceTable, lsOriginalSampleNames=lsOriginalSampleNames)
    return dictSelectedSamplesRet
| 579 | |
| 580 #Two happy path test cases | |
def _updatePredictFile(self, xPredictSupFile, xInputLabelsFile, dictltpleDistanceMeasurements, abundanceTable, lsOriginalSampleNames):
    """
    Merges new distance measurements into an existing predict file and rewrites the file.

    The existing file is read completely and closed, its distances are combined with
    dictltpleDistanceMeasurements, and the merged data is written by _writeToPredictFile.
    When a sample occurs in both sources for the same label group the new measurement is kept.

    :param xPredictSupFile: File that has predictions (distances) from the supervised method.
                            A stream is closed and the file is reopened through its name.
    :type: FileStream or String file path
    :param xInputLabelsFile: File that was input to the supervised methods (handed on to the writer).
    :type: FileStream or String file path
    :param dictltpleDistanceMeasurements: New measurements to merge into the file.
    :type: Dictionary of lists of tuples {"labelgroup":[("SampleName",dDistance)...], "labelgroup":[("SampleName",dDistance)...]}
    :param abundanceTable: Handed on to _writeToPredictFile.
    :type: AbundanceTable
    :param lsOriginalSampleNames: Sample names in the row order of the predict file; data row i is attributed to lsOriginalSampleNames[i].
    :type: List of strings
    """

    #Work from a path: the file is read here and rewritten by the writer
    if not isinstance(xPredictSupFile, str):
        xPredictSupFile.close()
        xPredictSupFile = xPredictSupFile.name

    #Read everything and release the handle before the same path is opened for writing
    with open(xPredictSupFile, 'r') as istmPredict:
        csvrPredict = csv.reader(istmPredict, delimiter=ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace)
        #The first header cell names the label column and is not a measurement key
        lsHeader = next(csvrPredict)[1:]
        dictlltpleRead = {sHeader: [] for sHeader in lsHeader}

        #Read data in, the first cell of each row is the label and is skipped
        for iSampleIndex, lsRow in enumerate(csvrPredict):
            for iDistanceIndex, dDistance in enumerate(lsRow[1:]):
                if dDistance != ConstantsMicropita.c_sEmptyPredictFileValue:
                    dictlltpleRead[lsHeader[iDistanceIndex]].append((lsOriginalSampleNames[iSampleIndex], dDistance))

    #Combine dictltpleDistanceMeasurements with the data read from the file
    #If they share a key then merge keeping parameter data
    #If they do not share the key, keep the full data
    dictNew = {}
    for sKey, ltpleNew in dictltpleDistanceMeasurements.items():
        if sKey in dictlltpleRead:
            setNewSamples = set(tple[0] for tple in ltpleNew)
            dictNew[sKey] = ltpleNew + [tple for tple in dictlltpleRead[sKey] if tple[0] not in setNewSamples]
        else:
            dictNew[sKey] = ltpleNew
    for sKey, ltpleRead in dictlltpleRead.items():
        if sKey not in dictltpleDistanceMeasurements:
            dictNew[sKey] = ltpleRead

    #Call writer
    self._writeToPredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xInputLabelsFile,
                             dictltpleDistanceMeasurements=dictNew, abundanceTable=abundanceTable,
                             lsOriginalSampleNames=lsOriginalSampleNames, fFromUpdate=True)
| 625 | |
| 626 #2 happy path test cases | |
def _writeToPredictFile(self, xPredictSupFile, xInputLabelsFile, dictltpleDistanceMeasurements, abundanceTable, lsOriginalSampleNames, fFromUpdate=False):
    """
    Write to the predict file.

    One header row ("labels" followed by the measurement keys) is written, then one row per
    sample in lsOriginalSampleNames holding the sample's label and its distance for each key.
    Missing labels and distances are written as ConstantsMicropita.c_sEmptyPredictFileValue.

    :param xPredictSupFile: File that has predictions (distances) from the supervised method.
                            A path is opened for writing and closed here; a stream is written to and left open.
    :type: FileStream or String file path
    :param xInputLabelsFile: File that was input to the supervised methods, the labels are read from it.
    :type: FileStream or String file path
    :param dictltpleDistanceMeasurements: Distances to write, this dictionary is not modified.
    :type: Dictionary of lists of tuples {"labelgroup":[("SampleName",dDistance)...], "labelgroup":[("SampleName",dDistance)...]}
    :param abundanceTable: An abundance table of the sample data.
    :type: AbundanceTable
    :param lsOriginalSampleNames: Sample names that define the rows (and row order) of the file. When fFromUpdate is
                                  True they are also the names given to the label reader, otherwise the sample names
                                  of the abundance table are given to the label reader.
    :type: List of strings
    :param fFromUpdate: Indicates if this is part of an update to the file or not.
    :type: Boolean
    """

    xInputLabelsFileName = xInputLabelsFile if isinstance(xInputLabelsFile, str) else xInputLabelsFile.name

    #Read the labels before touching the predict file so a failure here does not leave a truncated file
    lsAllSampleNames = abundanceTable.funcGetSampleNames()
    dictLabelSamples = SVM.funcReadLabelsFromFile(xSVMFile=xInputLabelsFileName,
                                                  lsAllSampleNames=lsOriginalSampleNames if fFromUpdate else lsAllSampleNames,
                                                  isPredictFile=False)
    dictLabels = {sSample: sLabel for sLabel, lsSamples in dictLabelSamples.items() for sSample in lsSamples}

    #Dictionary keys will be used to order the predict file
    lsMeasurementKeys = list(dictltpleDistanceMeasurements.keys())

    #Per key lookup of sample name -> distance, built locally so the caller's data is left alone
    dictSampleDistances = {sKey: dict(dictltpleDistanceMeasurements[sKey]) for sKey in lsMeasurementKeys}

    sEmpty = ConstantsMicropita.c_sEmptyPredictFileValue
    fOpenedHere = isinstance(xPredictSupFile, str)
    ostmPredict = open(xPredictSupFile, "w") if fOpenedHere else xPredictSupFile
    try:
        csvwPredict = csv.writer(ostmPredict, delimiter=ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace)

        #Make header
        csvwPredict.writerow(["labels"] + lsMeasurementKeys)

        #Make body of file
        for sSample in lsOriginalSampleNames:
            csvwPredict.writerow([dictLabels.get(sSample, sEmpty)] +
                                 [str(dictSampleDistances[sKey].get(sSample, sEmpty)) for sKey in lsMeasurementKeys])
    finally:
        #Only close what was opened here, a stream handed in stays under the caller's control
        if fOpenedHere:
            ostmPredict.close()
| 670 | |
def _funcRunNormalizeSensitiveMethods(self, abndData, iSampleSelectionCount, dictSelectedSamples, lsAlphaMetrics, lsBetaMetrics, lsInverseBetaMetrics,
                                      fRunDiversity, fRunRepresentative, fRunExtreme, strAlphaMetadata=None,
                                      istmBetaMatrix=None, istrmTree=None, istrmEnvr=None, fInvertDiversity=False):
    """
    Manages running methods that are sensitive to normalization. This is called twice, once for the set of methods
    which should not be normalized and the other for the set that should be normalized.

    :param abndData: Abundance table object holding the samples to be measured.
    :type: AbundanceTable
    :param iSampleSelectionCount: The number of samples to select per method.
    :type: Integer
    :param dictSelectedSamples: Will be added to as samples are selected {"Method":["strSelectedSampleID","strSelectedSampleID"...]}.
                                The dictionary is modified in place and is also the return value.
    :type: Dictionary
    :param lsAlphaMetrics: List of alpha metrics to use on alpha metric dependent assays (like highest diversity).
                           Ignored when strAlphaMetadata is given.
    :type: List of strings
    :param lsBetaMetrics: List of beta metrics to use on beta metric dependent assays (like most representative).
                          Ignored when istmBetaMatrix is given.
    :type: List of strings
    :param lsInverseBetaMetrics: List of inverse beta metrics to use on inverse beta metric dependent assays (like most dissimilar).
                                 Ignored when istmBetaMatrix is given.
    :type: List of strings
    :param fRunDiversity: Run Diversity based methods (true indicates run).
    :type: Boolean
    :param fRunRepresentative: Run Representative based methods (true indicates run).
    :type: Boolean
    :param fRunExtreme: Run Extreme based methods (true indicates run).
    :type: Boolean
    :param strAlphaMetadata: Id of a metadata whose values are read as floats and used as the diversity measurement.
    :type: String
    :param istmBetaMatrix: File that has a precalculated beta matrix
    :type: File stream or File path string
    :param istrmTree: Handed on to the representative and extreme selection methods.
    :type: File stream or File path string
    :param istrmEnvr: Handed on to the representative and extreme selection methods.
    :type: File stream or File path string
    :param fInvertDiversity: When true each diversity measurement is replaced by its reciprocal before ranking.
    :type: Boolean
    :return Selected Samples: Samples selected by methods.
            Dictionary {"Selection Method":["SampleID","SampleID","SampleID",...]}
    """

    #Sample ids/names
    lsSampleNames = abndData.funcGetSampleNames()

    #Generate alpha metrics and get most diverse
    if fRunDiversity:

        #Name of technique, one entry per row of the alpha matrix
        lsMethodNames = [strAlphaMetadata] if strAlphaMetadata else lsAlphaMetrics

        #Get Alpha metrics matrix
        if strAlphaMetadata:
            #If given an alpha-diversity metadata use its values as the single row of measurements
            internalAlphaMatrix = [[float(strNum) for strNum in abndData.funcGetMetadata(strAlphaMetadata)]]
        else:
            #Expects Observations (Taxa (row) x sample (column))
            #Returns [[metric1-sample1, metric1-sample2, metric1-sample3],[metric1-sample1, metric1-sample2, metric1-sample3]]
            #Summed tables are first reduced to their terminal nodes
            if abndData.funcIsSummed():
                npaAbundance = abndData.funcGetFeatureAbundanceTable(abndData.funcGetTerminalNodes()).funcGetAbundanceCopy()
            else:
                npaAbundance = abndData.funcGetAbundanceCopy()
            internalAlphaMatrix = Metric.funcBuildAlphaMetricsMatrix(npaSampleAbundance = npaAbundance,
                                                                     lsSampleNames = lsSampleNames,
                                                                     lsDiversityMetricAlpha = lsAlphaMetrics)

        if internalAlphaMatrix:
            #Invert measurements
            #The floor of c_smallNumber avoids dividing by zero and 1.0 keeps the division from truncating integers
            if fInvertDiversity:
                internalAlphaMatrix = [[1.0/max(dValue, ConstantsMicropita.c_smallNumber) for dValue in lsLine]
                                       for lsLine in internalAlphaMatrix]

            #Get top ranked alpha diversity by most diverse
            #Expects [[sample1,sample2,sample3...],[sample1,sample2,sample3..],...]
            #Returns [[sampleName1, sampleName2, sampleNameN],[sampleName1, sampleName2, sampleNameN]]
            mostDiverseAlphaSamplesIndexes = self.funcGetTopRankedSamples(lldMatrix=internalAlphaMatrix, lsSampleNames=lsSampleNames,
                                                                          iTopAmount=iSampleSelectionCount)

            #Add to results
            for iMethod, sMethodName in enumerate(lsMethodNames):
                strSelectionMethod = self.dictConvertAMetricDiversity.get(sMethodName, ConstantsMicropita.c_strDiversity+"="+sMethodName)
                dictSelectedSamples.setdefault(strSelectionMethod,[]).extend(mostDiverseAlphaSamplesIndexes[iMethod])

    logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Selected Samples 1b")
    logging.info(dictSelectedSamples)

    #Generate beta metrics and select on them
    if fRunRepresentative or fRunExtreme:

        #Abundance matrix transposed
        npaTransposedAbundance = UtilityMath.funcTransposeDataMatrix(abndData.funcGetAbundanceCopy(), fRemoveAdornments=True)

        #Get center selection using clusters/tiling
        #This will be for beta metrics in normalized space
        if fRunRepresentative:

            if istmBetaMatrix:
                #Get representative dissimilarity samples from the precalculated matrix
                medoidSamples = self.funcGetCentralSamplesByKMedoids(npaMatrix=npaTransposedAbundance, sMetric=ConstantsMicropita.c_custom,
                                                                     lsSampleNames=lsSampleNames, iNumberSamplesReturned=iSampleSelectionCount,
                                                                     istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)

                if medoidSamples:
                    dictSelectedSamples.setdefault(ConstantsMicropita.c_strRepresentative+"="+ConstantsMicropita.c_custom,[]).extend(medoidSamples)
            else:
                logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Performing representative selection on normalized data.")
                for bMetric in lsBetaMetrics:

                    #Get representative dissimilarity samples
                    medoidSamples = self.funcGetCentralSamplesByKMedoids(npaMatrix=npaTransposedAbundance, sMetric=bMetric,
                                                                         lsSampleNames=lsSampleNames, iNumberSamplesReturned=iSampleSelectionCount,
                                                                         istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)

                    if medoidSamples:
                        dictSelectedSamples.setdefault(self.dictConvertBMetricToMethod.get(bMetric, ConstantsMicropita.c_strRepresentative+"="+bMetric),[]).extend(medoidSamples)

        #Get extreme selection using clusters, tiling
        if fRunExtreme:
            logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Performing extreme selection on normalized data.")
            if istmBetaMatrix:

                #Samples for representative dissimilarity
                #This involves inverting the distance metric,
                #Taking the dendrogram level of where the number cluster == the number of samples to select
                #Returning a representative sample from each cluster
                extremeSamples = self.funcSelectExtremeSamplesFromHClust(strBetaMetric=ConstantsMicropita.c_custom, npaAbundanceMatrix=npaTransposedAbundance,
                                                                         lsSampleNames=lsSampleNames, iSelectSampleCount=iSampleSelectionCount,
                                                                         istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)

                #Add selected samples
                if extremeSamples:
                    dictSelectedSamples.setdefault(ConstantsMicropita.c_strExtreme+"="+ConstantsMicropita.c_custom,[]).extend(extremeSamples)

            else:
                #Same selection as above, once per inverse beta metric
                for bMetric in lsInverseBetaMetrics:

                    extremeSamples = self.funcSelectExtremeSamplesFromHClust(strBetaMetric=bMetric, npaAbundanceMatrix=npaTransposedAbundance,
                                                                             lsSampleNames=lsSampleNames, iSelectSampleCount=iSampleSelectionCount,
                                                                             istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)

                    #Add selected samples
                    if extremeSamples:
                        dictSelectedSamples.setdefault(self.dictConvertInvBMetricToMethod.get(bMetric, ConstantsMicropita.c_strExtreme+"="+bMetric),[]).extend(extremeSamples)

    logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Selected Samples 2,3b")
    logging.info(dictSelectedSamples)
    return dictSelectedSamples
| 802 | |
def funcRun(self, strIDName, strLastMetadataName, istmInput,
            ostmInputPredictFile, ostmPredictFile, ostmCheckedFile, ostmOutput,
            cDelimiter, cFeatureNameDelimiter, strFeatureSelection,
            istmFeatures, iCount, lstrMethods, strLastRowMetadata = None, strLabel = None, strStratify = None,
            strCustomAlpha = None, strCustomBeta = None, strAlphaMetadata = None, istmBetaMatrix = None, istrmTree = None, istrmEnvr = None,
            iMinSeqs = ConstantsMicropita.c_liOccurenceFilter[0], iMinSamples = ConstantsMicropita.c_liOccurenceFilter[1], fInvertDiversity = False):
    """
    Manages the selection of samples given different metrics.

    :param strIDName: Sample Id metadata row
    :type: String
    :param strLastMetadataName: The id of the metadata positioned last in the abundance table.
    :type: String String metadata id.
    :param istmInput: File holding the abundance table to read.
    :type: FileStream or String file path
    :param ostmInputPredictFile: File to store the input data of the supervised methods.
    :type: FileStream or String file path
    :param ostmPredictFile: File to store distances from supervised methods.
    :type: FileStream or String file path
    :param ostmCheckedFile: File to store the AbundanceTable data after it is being checked.
    :type: FileStream or String file path
    :param ostmOutput: File to store sample selection by methods of interest. Not used inside this method.
    :type: FileStream or String file path
    :param cDelimiter: Delimiter of abundance table.
    :type: Character Char (default TAB).
    :param cFeatureNameDelimiter: Delimiter of the name of features (for instance if they contain consensus lineages indicating clades).
    :type: Character (default |).
    :param strFeatureSelection: Which method to use to select features in a targeted manner (Using average ranked abundance or average abundance).
    :type: String (specific values indicated in ConstantsMicropita.lsTargetedFeatureMethodValues).
    :param istmFeatures: File which holds the features of interest if using targeted feature methodology, one feature per line.
    :type: FileStream
    :param iCount: Number of samples to select in each methods, supervised methods select this amount per label if possible.
    :type: Integer integer.
    :param lstrMethods: List of strings indicating selection techniques.
    :type: List of string method names
    :param strLastRowMetadata: Given to the abundance table reader as the last metadata row.
    :type: String
    :param strLabel: The metadata used for supervised labels.
    :type: String
    :param strStratify: The metadata used to stratify unsupervised data.
    :type: String
    :param strCustomAlpha: Custom alpha diversity metric
    :type: String
    :param strCustomBeta: Custom beta diversity metric
    :type: String
    :param strAlphaMetadata: Metadata id which is a diversity metric to use in highest diversity sampling
    :type: String
    :param istmBetaMatrix: File containing precalculated beta-diversity matrix for representative sampling
    :type: FileStream or String file path
    :param istrmTree: File containing tree for phylogenetic beta-diversity analysis
    :type: FileStream or String file path
    :param istrmEnvr: File containing environment for phylogenetic beta-diversity analysis
    :type: FileStream or String file path
    :param iMinSeqs: Minimum sequence in the occurrence filter which filters all features not with a minimum number of sequences in each of a minimum number of samples.
    :type: Integer
    :param iMinSamples: Minimum sample count for the occurrence filter.
    :type: Integer
    :param fInvertDiversity: When true will invert diversity measurements before using.
    :type: boolean
    :return Selected Samples: Samples selected by methods.
            Dictionary {"Selection Method":["SampleID","SampleID","SampleID",...]}
            False is returned instead when the targeted feature method is requested without a feature file,
            when the abundance table can not be read, or when the label metadata has fewer than 2 unique values.
    """

    #Holds the top ranked samples from different metrics
    #dict[metric name] = [samplename,samplename...]
    selectedSamples = dict()

    #Perform different flows flags
    fRunDiversity = ConstantsMicropita.c_strDiversity in lstrMethods
    fRunRepresentative = ConstantsMicropita.c_strRepresentative in lstrMethods
    fRunExtreme = ConstantsMicropita.c_strExtreme in lstrMethods
    fRunFeature = ConstantsMicropita.c_strFeature in lstrMethods
    fRunRandom = ConstantsMicropita.c_strRandom in lstrMethods
    fRunDistinct = ConstantsMicropita.c_strDistinct in lstrMethods
    fRunDiscriminant = ConstantsMicropita.c_strDiscriminant in lstrMethods
    fRunNormalizeSensitive = fRunDiversity or fRunRepresentative or fRunExtreme

    #The targeted feature method can not run without the file of features
    if fRunFeature and not istmFeatures:
        logging.error("MicroPITA.funcRun:: Did not receive both the Targeted feature file and the feature selection method. MicroPITA did not run.")
        return False

    #Diversity metrics to run
    #Use custom metrics if specified
    #Custom beta metrics set to normalized only, custom alpha metrics set to count only
    diversityMetricsAlpha = [] if strCustomAlpha or strAlphaMetadata else [MicroPITA.c_strInverseSimpsonDiversity]
    if istmBetaMatrix:
        diversityMetricsBeta = []
    elif strCustomBeta:
        diversityMetricsBeta = [strCustomBeta]
    else:
        diversityMetricsBeta = [MicroPITA.c_strBrayCurtisDissimilarity]
    if strAlphaMetadata:
        diversityMetricsAlphaNoNormalize = [strAlphaMetadata]
    elif strCustomAlpha:
        diversityMetricsAlphaNoNormalize = [strCustomAlpha]
    else:
        diversityMetricsAlphaNoNormalize = []
    diversityMetricsBetaNoNormalize = []

    #Targeted taxa
    #Read in taxa list, break down to lines and filter out empty strings
    #Kept as a list because it is handed to the selection once per abundance block
    userDefinedTaxa = []
    if fRunFeature:
        userDefinedTaxa = [sTaxon for sTaxon in (sLine.strip() for sLine in istmFeatures.readlines()) if sTaxon]

    #Read in abundance data
    #Abundance is a structured array. Samples (column) by Taxa (rows) with the taxa id row included as the column index=0
    #Abundance table object to read in and manage data
    totalAbundanceTable = AbundanceTable.funcMakeFromFile(xInputFile=istmInput, lOccurenceFilter = [iMinSeqs, iMinSamples],
                                                          cDelimiter=cDelimiter, sMetadataID=strIDName, sLastMetadataRow=strLastRowMetadata,
                                                          sLastMetadata=strLastMetadataName, cFeatureNameDelimiter=cFeatureNameDelimiter, xOutputFile=ostmCheckedFile)
    if not totalAbundanceTable:
        logging.error("MicroPITA.funcRun:: Could not read in the abundance table. Analysis was not performed."+
                      " This often occurs when the Last Metadata is not specified correctly."+
                      " Please check to make sure the Last Metadata selection is the row of the last metadata,"+
                      " all values after this selection should be microbial measurements and should be numeric.")
        return False

    lsOriginalLabels = SVM.funcMakeLabels(totalAbundanceTable.funcGetMetadata(strLabel)) if strLabel else strLabel

    dictTotalMetadata = totalAbundanceTable.funcGetMetadataCopy()
    logging.debug("MicroPITA.funcRun:: Received metadata=" + str(dictTotalMetadata))
    #If there is only 1 unique value for the labels, do not run the Supervised methods
    if strLabel and ( len(set(dictTotalMetadata.get(strLabel,[]))) < 2 ):
        logging.error("The label " + strLabel + " did not have 2 or more values. Labels found=" + str(dictTotalMetadata.get(strLabel,[])))
        return False

    #Run unsupervised methods###
    #Stratify the data if need be and drop the old data
    lStratifiedAbundanceTables = totalAbundanceTable.funcStratifyByMetadata(strStratify) if strStratify else [totalAbundanceTable]

    #For each stratified abundance block or for the unstratified abundance
    #Run the unsupervised blocks
    #The supervised output files are created for the first block and appended to afterwards
    fAppendSupFiles = False
    for stratAbundanceTable in lStratifiedAbundanceTables:
        logging.info("MicroPITA.funcRun:: Running abundance block:"+stratAbundanceTable.funcGetName())

        ###NOT SUMMED, NOT NORMALIZED
        #Only perform if the data is not yet normalized
        #Need to first work with unnormalized data
        if fRunNormalizeSensitive and not stratAbundanceTable.funcIsNormalized( ):
            self._funcRunNormalizeSensitiveMethods(abndData=stratAbundanceTable, iSampleSelectionCount=iCount,
                                                   dictSelectedSamples=selectedSamples, lsAlphaMetrics=diversityMetricsAlphaNoNormalize,
                                                   lsBetaMetrics=diversityMetricsBetaNoNormalize,
                                                   lsInverseBetaMetrics=diversityMetricsBetaNoNormalize,
                                                   fRunDiversity=fRunDiversity, fRunRepresentative=fRunRepresentative,
                                                   fRunExtreme=fRunExtreme, strAlphaMetadata=strAlphaMetadata,
                                                   istrmTree=istrmTree, istrmEnvr=istrmEnvr, fInvertDiversity=fInvertDiversity)

        #Generate selection by the rank average of user defined taxa
        #Expects (Taxa (row) by Samples (column))
        #Expects a column 0 of taxa id that is skipped
        #Returns [(sample name,average,rank)]
        #SUMMED AND NORMALIZED
        stratAbundanceTable.funcSumClades()
        #Normalize data at this point
        stratAbundanceTable.funcNormalize()
        if fRunFeature:
            #NOTE(review): this assignment replaces the selection of earlier abundance blocks while the
            #diversity/representative/extreme/supervised methods accumulate over blocks - confirm this is intended
            selectedSamples[ConstantsMicropita.c_strFeature] = self.funcSelectTargetedTaxaSamples(abndMatrix=stratAbundanceTable,
                                                                                                  lsTargetedTaxa=userDefinedTaxa, iSampleSelectionCount=iCount, sMethod=strFeatureSelection)
            logging.info("MicroPITA.funcRun:: Selected Samples Rank")
            logging.info(selectedSamples)

        ###SUMMED AND NORMALIZED analysis block
        #Diversity based metric will move reduce to terminal taxa as needed
        if fRunNormalizeSensitive:
            self._funcRunNormalizeSensitiveMethods(abndData=stratAbundanceTable, iSampleSelectionCount=iCount,
                                                   dictSelectedSamples=selectedSamples, lsAlphaMetrics=diversityMetricsAlpha,
                                                   lsBetaMetrics=diversityMetricsBeta,
                                                   lsInverseBetaMetrics=diversityMetricsBeta,
                                                   fRunDiversity=fRunDiversity, fRunRepresentative=fRunRepresentative,
                                                   fRunExtreme=fRunExtreme,
                                                   istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr, fInvertDiversity=fInvertDiversity)

        #5::Select randomly
        #Expects sampleNames = List of sample names [name, name, name...]
        if fRunRandom:
            #Select randomly from sample names
            #NOTE(review): as with the targeted features, the selection of earlier abundance blocks is replaced - confirm
            selectedSamples[ConstantsMicropita.c_strRandom] = self.funcGetRandomSamples(lsSamples=stratAbundanceTable.funcGetSampleNames(), iNumberOfSamplesToReturn=iCount)
            logging.info("MicroPITA.funcRun:: Selected Samples Random")
            logging.info(selectedSamples)

        #Perform supervised selection
        if (fRunDistinct or fRunDiscriminant) and strLabel:
            dictSelectionRet = self.funcRunSupervisedDistancesFromCentroids(abundanceTable=stratAbundanceTable,
                                                                            fRunDistinct=fRunDistinct, fRunDiscriminant=fRunDiscriminant,
                                                                            xOutputSupFile=ostmInputPredictFile, xPredictSupFile=ostmPredictFile,
                                                                            strSupervisedMetadata=strLabel, iSampleSupSelectionCount=iCount,
                                                                            lsOriginalSampleNames = totalAbundanceTable.funcGetSampleNames(),
                                                                            lsOriginalLabels = lsOriginalLabels,
                                                                            fAppendFiles=fAppendSupFiles)

            for sKey, lValue in dictSelectionRet.items():
                selectedSamples.setdefault(sKey,[]).extend(lValue)

            #Every later abundance block appends to the files written for this one
            fAppendSupFiles = True
            logging.info("MicroPITA.funcRun:: Selected Samples Unsupervised")
            logging.info(selectedSamples)
    return selectedSamples
| 1000 | |
| 1001 #Testing: Happy path tested | |
@staticmethod
def funcWriteSelectionToFile(dictSelection,xOutputFilePath):
    """
    Writes the selection of samples by method to an output file.

    One row is written per method: the method name followed by its selected samples,
    delimited by ConstantsMicropita.c_outputFileDelim. Nothing is written (and no file is
    opened) when dictSelection is empty or None.

    :param dictSelection: The dictionary of selections by method to be written to a file.
    :type: Dictionary The dictionary of selections by method {"method":["sample selected","sample selected"...]}
    :param xOutputFilePath: FileStream or String path to file in which the dictionary is written.
                            A path is opened for writing and closed here; a stream is written to and left open.
    :type: String FileStream or String path to file
    """

    if not dictSelection:
        return

    #Open file
    fOpenedHere = isinstance(xOutputFilePath, str)
    ostmSelection = open(xOutputFilePath,"w") if fOpenedHere else xOutputFilePath
    try:
        csvwSelection = csv.writer(ostmSelection, delimiter=ConstantsMicropita.c_outputFileDelim)

        #Create output content from dictionary
        for sKey in dictSelection:
            csvwSelection.writerow([sKey]+dictSelection[sKey])
            logging.debug("MicroPITA.funcWriteSelectionToFile:: Selected samples output to file:"+str(dictSelection[sKey]))
    finally:
        #Close only a file opened here so the rows are flushed to disk, a stream stays with the caller
        if fOpenedHere:
            ostmSelection.close()
| 1023 | |
| 1024 #Testing: Happy Path tested | |
@staticmethod
def funcReadSelectionFileToDictionary(xInputFile):
    """
    Reads in an output selection file from micropita and formats it into a dictionary.

    The first cell of each row is used as the method name and the remaining cells as its samples.
    Rows without any cell (blank lines) are skipped. If a method name occurs on several rows the last row is kept.

    :param xInputFile: String path to file or file stream to read and translate into a dictionary.
                       {"method":["sample selected","sample selected"...]}
                       A path is opened and closed here; a stream is read and left open.
    :type: FileStream or String Path to file
    :return Dictionary: Samples selected by methods.
            Dictionary {"Selection Method":["SampleID","SampleID","SampleID",...]}
    """

    #Open file
    fOpenedHere = isinstance(xInputFile, str)
    istmSelection = open(xInputFile,'r') if fOpenedHere else xInputFile
    try:
        csvrSelection = csv.reader(istmSelection, delimiter = ConstantsMicropita.c_outputFileDelim)

        #Dictionary to hold selection data
        #csv gives an empty list for a blank line, which has no method name to index
        return {lsLine[0]: lsLine[1:] for lsLine in csvrSelection if lsLine}
    finally:
        if fOpenedHere:
            istmSelection.close()
| 1042 | |
| 1043 #Set up arguments reader | |
| 1044 argp = argparse.ArgumentParser( prog = "MicroPITA.py", | |
| 1045 description = """Selects samples from abundance tables based on various selection schemes.""" ) | |
| 1046 | |
| 1047 args = argp.add_argument_group( "Common", "Commonly modified options" ) | |
| 1048 args.add_argument(ConstantsMicropita.c_strCountArgument,"--num", dest="iCount", metavar = "samples", default = 10, type = int, help = ConstantsMicropita.c_strCountHelp) | |
| 1049 args.add_argument("-m","--method", dest = "lstrMethods", metavar = "method", default = [], help = ConstantsMicropita.c_strSelectionTechniquesHelp, | |
| 1050 choices = ConstantsMicropita.c_lsAllMethods, action = "append") | |
| 1051 | |
| 1052 args = argp.add_argument_group( "Custom", "Selecting and inputing custom metrics" ) | |
| 1053 args.add_argument("-a","--alpha", dest = "strAlphaDiversity", metavar = "AlphaDiversity", default = None, help = ConstantsMicropita.c_strCustomAlphaDiversityHelp, choices = Metric.setAlphaDiversities) | |
| 1054 args.add_argument("-b","--beta", dest = "strBetaDiversity", metavar = "BetaDiversity", default = None, help = ConstantsMicropita.c_strCustomBetaDiversityHelp, choices = list(Metric.setBetaDiversities)+[Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted]) | |
| 1055 args.add_argument("-q","--alphameta", dest = "strAlphaMetadata", metavar = "AlphaDiversityMetadata", default = None, help = ConstantsMicropita.c_strCustomAlphaDiversityMetadataHelp) | |
| 1056 args.add_argument("-x","--betamatrix", dest = "istmBetaMatrix", metavar = "BetaDiversityMatrix", default = None, help = ConstantsMicropita.c_strCustomBetaDiversityMatrixHelp) | |
| 1057 args.add_argument("-o","--tree", dest = "istrmTree", metavar = "PhylogeneticTree", default = None, help = ConstantsMicropita.c_strCustomPhylogeneticTreeHelp) | |
| 1058 args.add_argument("-i","--envr", dest = "istrmEnvr", metavar = "EnvironmentFile", default = None, help = ConstantsMicropita.c_strCustomEnvironmentFileHelp) | |
| 1059 args.add_argument("-f","--invertDiversity", dest = "fInvertDiversity", action="store_true", default = False, help = ConstantsMicropita.c_strInvertDiversityHelp) | |
| 1060 | |
# "Miscellaneous" group: row/column identifiers of the abundance table and
# the options that control feature targeting.
args = argp.add_argument_group("Miscellaneous", "Row/column identifiers and feature targeting options")

args.add_argument(
    "-d",
    ConstantsMicropita.c_strIDNameArgument,
    dest="strIDName",
    metavar="sample_id",
    help=ConstantsMicropita.c_strIDNameHelp,
)

args.add_argument(
    "-l",
    ConstantsMicropita.c_strLastMetadataNameArgument,
    dest="strLastMetadataName",
    metavar="metadata_id",
    default=None,
    help=ConstantsMicropita.c_strLastMetadataNameHelp,
)

# Targeting method: must be one of lsTargetedFeatureMethodValues, the first
# entry of that list being the default.
args.add_argument(
    "-r",
    ConstantsMicropita.c_strTargetedFeatureMethodArgument,
    dest="strFeatureSelection",
    metavar="targeting_method",
    choices=ConstantsMicropita.lsTargetedFeatureMethodValues,
    default=ConstantsMicropita.lsTargetedFeatureMethodValues[0],
    help=ConstantsMicropita.c_strTargetedFeatureMethodHelp,
)

# Feature file: argparse opens it for reading (mode "rU") and stores the
# resulting file object.
args.add_argument(
    "-t",
    ConstantsMicropita.c_strTargetedSelectionFileArgument,
    dest="istmFeatures",
    metavar="feature_file",
    type=argparse.FileType("rU"),
    help=ConstantsMicropita.c_strTargetedSelectionFileHelp,
)

args.add_argument(
    "-w",
    ConstantsMicropita.c_strFeatureMetadataArgument,
    dest="strLastFeatureMetadata",
    metavar="Last_Feature_Metadata",
    default=None,
    help=ConstantsMicropita.c_strFeatureMetadataHelp,
)
| 1069 | |
# "Data labeling" group: metadata IDs used as the supervised label and as
# the stratification key. Neither option declares a default, so each is
# None when omitted.
args = argp.add_argument_group("Data labeling", "Metadata IDs for strata and supervised label values")

args.add_argument(
    "-e",
    ConstantsMicropita.c_strSupervisedLabelArgument,
    dest="strLabel",
    metavar="supervised_id",
    help=ConstantsMicropita.c_strSupervisedLabelHelp,
)

args.add_argument(
    "-s",
    ConstantsMicropita.c_strUnsupervisedStratifyMetadataArgument,
    dest="strUnsupervisedStratify",
    metavar="stratify_id",
    help=ConstantsMicropita.c_strUnsupervisedStratifyMetadataHelp,
)
| 1074 | |
# "File formatting" group: delimiters used when reading the input table.
args = argp.add_argument_group("File formatting", "Rarely modified file formatting options")

# Column delimiter; a tab unless overridden.
args.add_argument(
    "-j",
    ConstantsMicropita.c_strFileDelimiterArgument,
    dest="cFileDelimiter",
    metavar="column_delimiter",
    default="\t",
    help=ConstantsMicropita.c_strFileDelimiterHelp,
)

# Delimiter inside feature names; a pipe unless overridden.
args.add_argument(
    "-k",
    ConstantsMicropita.c_strFeatureNameDelimiterArgument,
    dest="cFeatureNameDelimiter",
    metavar="taxonomy_delimiter",
    default="|",
    help=ConstantsMicropita.c_strFeatureNameDelimiterHelp,
)
| 1078 | |
# "Debugging" group: log verbosity plus optional diagnostic output files.
# Every file option here is opened for writing by argparse as soon as the
# command line is parsed.
args = argp.add_argument_group("Debugging", "Debugging options - modify at your own risk!")

# Log level name; _main looks the upper-cased name up on the logging module.
args.add_argument(
    "-v",
    ConstantsMicropita.c_strLoggingArgument,
    dest="strLogLevel",
    metavar="log_level",
    choices=ConstantsMicropita.c_lsLoggingChoices,
    default="WARNING",
    help=ConstantsMicropita.c_strLoggingHelp,
)

args.add_argument(
    "-c",
    ConstantsMicropita.c_strCheckedAbundanceFileArgument,
    dest="ostmCheckedFile",
    metavar="output_qc",
    type=argparse.FileType("w"),
    help=ConstantsMicropita.c_strCheckedAbundanceFileHelp,
)

# Log destination; _main falls back to stderr when this is not given.
args.add_argument(
    "-g",
    ConstantsMicropita.c_strLoggingFileArgument,
    dest="ostmLoggingFile",
    metavar="output_log",
    type=argparse.FileType("w"),
    help=ConstantsMicropita.c_strLoggingFileHelp,
)

args.add_argument(
    "-u",
    ConstantsMicropita.c_strSupervisedInputFile,
    dest="ostmInputPredictFile",
    metavar="output_scaled",
    type=argparse.FileType("w"),
    help=ConstantsMicropita.c_strSupervisedInputFileHelp,
)

args.add_argument(
    "-p",
    ConstantsMicropita.c_strSupervisedPredictedFile,
    dest="ostmPredictFile",
    metavar="output_labels",
    type=argparse.FileType("w"),
    help=ConstantsMicropita.c_strSupervisedPredictedFileHelp,
)
| 1086 | |
# Positional arguments: the abundance table to read, then the file that
# receives the selection. argparse opens both while parsing.
# NOTE(review): neither positional declares `nargs`, and argparse only falls
# back to `default` for positionals with nargs "?" or "*", so the
# sys.stdin / sys.stdout defaults below look unused and both paths are
# effectively required - confirm before relying on them.
argp.add_argument(
    "istmInput",
    metavar="input.pcl/biome",
    type=argparse.FileType("rU"),
    default=sys.stdin,
    help=ConstantsMicropita.c_strAbundanceFileHelp,
)
argp.add_argument(
    "ostmOutput",
    metavar="output.txt",
    type=argparse.FileType("w"),
    default=sys.stdout,
    help=ConstantsMicropita.c_strGenericOutputDataFileHelp,
)
| 1091 | |
# Prefix the module docstring with the generated command-line help, placed
# after a "::" marker and indented by one tab per line (presumably so that
# documentation tools render it as a literal block - confirm).
# When the interpreter runs with -OO, docstrings are stripped and __doc__ is
# None; fall back to an empty string so loading the module does not fail with
# a TypeError on the concatenation.
__doc__ = "::\n\n\t" + argp.format_help().replace("\n", "\n\t") + (__doc__ or "")
| 1093 | |
def _main( ):
    """
    Command-line driver: parse the arguments, configure logging, run the
    sample selection and write the selection to the output stream.

    Returns -1 when funcRun yields an empty result; otherwise falls off the
    end and returns None.
    """
    cliArgs = argp.parse_args()

    # Log to the -g file when one was supplied, otherwise to stderr. The
    # level is resolved by name on the logging module (None if not found).
    stmLog = cliArgs.ostmLoggingFile or sys.stderr
    logging.basicConfig(
        stream=stmLog,
        filemode='w',
        level=getattr(logging, cliArgs.strLogLevel.upper(), None),
    )

    logging.info("MicroPITA:: Start microPITA")
    selector = MicroPITA()

    # With action="append" argparse adds to the declared default instead of
    # replacing it, so the parser default is an empty list and the actual
    # fallback method is applied here.
    if not cliArgs.lstrMethods:
        cliArgs.lstrMethods = [ConstantsMicropita.c_strRepresentative]

    # Map the parsed command-line values onto funcRun's keyword parameters.
    dictRunOptions = {
        "strIDName": cliArgs.strIDName,
        "strLastMetadataName": cliArgs.strLastMetadataName,
        "istmInput": cliArgs.istmInput,
        "ostmInputPredictFile": cliArgs.ostmInputPredictFile,
        "ostmPredictFile": cliArgs.ostmPredictFile,
        "ostmCheckedFile": cliArgs.ostmCheckedFile,
        "ostmOutput": cliArgs.ostmOutput,
        "cDelimiter": cliArgs.cFileDelimiter,
        "cFeatureNameDelimiter": cliArgs.cFeatureNameDelimiter,
        "istmFeatures": cliArgs.istmFeatures,
        "strFeatureSelection": cliArgs.strFeatureSelection,
        "iCount": cliArgs.iCount,
        "strLastRowMetadata": cliArgs.strLastFeatureMetadata,
        "strLabel": cliArgs.strLabel,
        "strStratify": cliArgs.strUnsupervisedStratify,
        "strCustomAlpha": cliArgs.strAlphaDiversity,
        "strCustomBeta": cliArgs.strBetaDiversity,
        "strAlphaMetadata": cliArgs.strAlphaMetadata,
        "istmBetaMatrix": cliArgs.istmBetaMatrix,
        "istrmTree": cliArgs.istrmTree,
        "istrmEnvr": cliArgs.istrmEnvr,
        "lstrMethods": cliArgs.lstrMethods,
        "fInvertDiversity": cliArgs.fInvertDiversity,
    }
    selection = selector.funcRun(**dictRunOptions)

    # An empty/false result is treated as a failed analysis.
    if not selection:
        logging.error("MicroPITA:: Error, did not get a result from analysis.")
        return -1
    logging.info("End microPITA")

    # Dump the raw selection at debug level.
    logging.debug("MicroPITA:: Returned the following samples:"+str(selection))

    # Persist the selection to the requested output.
    selector.funcWriteSelectionToFile(dictSelection=selection, xOutputFilePath=cliArgs.ostmOutput)
| 1145 | |
if __name__ == "__main__":
    # _main returns -1 when the analysis produced no result and None on
    # success. Pass that value to sys.exit so a failed run reaches the caller
    # as a non-zero exit status instead of being silently discarded
    # (sys.exit(None) still exits with status 0).
    sys.exit(_main( ))
