Mercurial > repos > george-weingart > micropita
changeset 28:1d09ffab87a7 draft
Uploaded MicroPITA.py - fixed spaces and tabs inconsistencies
author | george-weingart |
---|---|
date | Tue, 22 Jun 2021 03:23:17 +0000 |
parents | d9862a9a4d84 |
children | 93259fd01929 |
files | MicroPITA.py |
diffstat | 1 files changed, 1115 insertions(+), 1012 deletions(-) [+] |
line wrap: on
line diff
--- a/MicroPITA.py Tue Jun 22 03:21:22 2021 +0000 +++ b/MicroPITA.py Tue Jun 22 03:23:17 2021 +0000 @@ -5,26 +5,45 @@ """ ##################################################################################### -#Copyright (C) <2012> +# Copyright (C) <2012> # -#Permission is hereby granted, free of charge, to any person obtaining a copy of -#this software and associated documentation files (the "Software"), to deal in the -#Software without restriction, including without limitation the rights to use, copy, -#modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, -#and to permit persons to whom the Software is furnished to do so, subject to -#the following conditions: +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in the +# Software without restriction, including without limitation the rights to use, copy, +# modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, +# and to permit persons to whom the Software is furnished to do so, subject to +# the following conditions: # -#The above copyright notice and this permission notice shall be included in all copies -#or substantial portions of the Software. +# The above copyright notice and this permission notice shall be included in all copies +# or substantial portions of the Software. # -#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, -#INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A -#PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -#HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION -#OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -#SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ##################################################################################### +from types import * +import scipy.spatial.distance +import scipy.cluster.hierarchy as hcluster +import random +import os +import operator +import numpy as np +import mlpy +import math +import logging +import csv +from src.ConstantsMicropita import ConstantsMicropita +from src.breadcrumbs.src.UtilityMath import UtilityMath +from src.breadcrumbs.src.SVM import SVM +from src.breadcrumbs.src.MLPYDistanceAdaptor import MLPYDistanceAdaptor +from src.breadcrumbs.src.KMedoids import Kmedoids +from src.breadcrumbs.src.Metric import Metric +from src.breadcrumbs.src.ConstantsBreadCrumbs import ConstantsBreadCrumbs +from src.breadcrumbs.src.AbundanceTable import AbundanceTable __author__ = "Timothy Tickle" __copyright__ = "Copyright 2012" __credits__ = ["Timothy Tickle"] @@ -35,1113 +54,1197 @@ import sys import argparse -from src.breadcrumbs.src.AbundanceTable import AbundanceTable import warnings -warnings.simplefilter(action = "ignore", category = FutureWarning) -from src.breadcrumbs.src.ConstantsBreadCrumbs import ConstantsBreadCrumbs -from src.breadcrumbs.src.Metric import Metric -from src.breadcrumbs.src.KMedoids import Kmedoids -from src.breadcrumbs.src.MLPYDistanceAdaptor import MLPYDistanceAdaptor -from src.breadcrumbs.src.SVM import SVM -from src.breadcrumbs.src.UtilityMath import UtilityMath +warnings.simplefilter(action="ignore", category=FutureWarning) -from src.ConstantsMicropita import ConstantsMicropita 
-import csv -import logging -import math -import mlpy -import numpy as np -import operator -import os -import random -import scipy.cluster.hierarchy as hcluster -import scipy.spatial.distance -from types import * class MicroPITA: - """ - Selects samples from a first tier of a multi-tiered study to be used in a second tier. - Different methods can be used for selection. - The expected input is an abundance table (and potentially a text file of targeted features, - if using the targeted features option). Output is a list of samples exhibiting the - characteristics of interest. - """ + """ + Selects samples from a first tier of a multi-tiered study to be used in a second tier. + Different methods can be used for selection. + The expected input is an abundance table (and potentially a text file of targeted features, + if using the targeted features option). Output is a list of samples exhibiting the + characteristics of interest. + """ - #Constants - #Diversity metrics Alpha - c_strInverseSimpsonDiversity = Metric.c_strInvSimpsonDiversity - c_strChao1Diversity = Metric.c_strChao1Diversity + # Constants + # Diversity metrics Alpha + c_strInverseSimpsonDiversity = Metric.c_strInvSimpsonDiversity + c_strChao1Diversity = Metric.c_strChao1Diversity - #Diversity metrics Beta - c_strBrayCurtisDissimilarity = Metric.c_strBrayCurtisDissimilarity + # Diversity metrics Beta + c_strBrayCurtisDissimilarity = Metric.c_strBrayCurtisDissimilarity - #Additive inverses of diversity metrics beta - c_strInvBrayCurtisDissimilarity = Metric.c_strInvBrayCurtisDissimilarity + # Additive inverses of diversity metrics beta + c_strInvBrayCurtisDissimilarity = Metric.c_strInvBrayCurtisDissimilarity - #Technique Names - ConstantsMicropita.c_strDiversity2 = ConstantsMicropita.c_strDiversity+"_C" + # Technique Names + ConstantsMicropita.c_strDiversity2 = ConstantsMicropita.c_strDiversity+"_C" - #Targeted feature settings - c_strTargetedRanked = ConstantsMicropita.c_strTargetedRanked - 
c_strTargetedAbundance = ConstantsMicropita.c_strTargetedAbundance + # Targeted feature settings + c_strTargetedRanked = ConstantsMicropita.c_strTargetedRanked + c_strTargetedAbundance = ConstantsMicropita.c_strTargetedAbundance - #Technique groupings + # Technique groupings # c_lsDiversityMethods = [ConstantsMicropita.c_strDiversity,ConstantsMicropita.c_strDiversity2] - #Converts ecology metrics into standardized method selection names - dictConvertAMetricDiversity = {c_strInverseSimpsonDiversity:ConstantsMicropita.c_strDiversity, c_strChao1Diversity:ConstantsMicropita.c_strDiversity2} + # Converts ecology metrics into standardized method selection names + dictConvertAMetricDiversity = {c_strInverseSimpsonDiversity: ConstantsMicropita.c_strDiversity, + c_strChao1Diversity: ConstantsMicropita.c_strDiversity2} # dictConvertMicroPITAToAMetric = {ConstantsMicropita.c_strDiversity:c_strInverseSimpsonDiversity, ConstantsMicropita.c_strDiversity2:c_strChao1Diversity} - dictConvertBMetricToMethod = {c_strBrayCurtisDissimilarity:ConstantsMicropita.c_strRepresentative} - dictConvertInvBMetricToMethod = {c_strBrayCurtisDissimilarity:ConstantsMicropita.c_strExtreme} + dictConvertBMetricToMethod = { + c_strBrayCurtisDissimilarity: ConstantsMicropita.c_strRepresentative} + dictConvertInvBMetricToMethod = { + c_strBrayCurtisDissimilarity: ConstantsMicropita.c_strExtreme} + + # Linkage used in the Hierarchical clustering + c_strHierarchicalClusterMethod = 'average' - #Linkage used in the Hierarchical clustering - c_strHierarchicalClusterMethod = 'average' +# Group 1## Diversity + # Testing: Happy path Testing (8) + def funcGetTopRankedSamples(self, lldMatrix=None, lsSampleNames=None, iTopAmount=None): + """ + Given a list of lists of measurements, for each list the indices of the highest values are returned. If lsSamplesNames is given + it is treated as a list of string names that is in the order of the measurements in each list. 
Indices are returned or the sample + names associated with the indices. -####Group 1## Diversity - #Testing: Happy path Testing (8) - def funcGetTopRankedSamples(self, lldMatrix = None, lsSampleNames = None, iTopAmount = None): - """ - Given a list of lists of measurements, for each list the indices of the highest values are returned. If lsSamplesNames is given - it is treated as a list of string names that is in the order of the measurements in each list. Indices are returned or the sample - names associated with the indices. - - :param lldMatrix: List of lists [[value,value,value,value],[value,value,value,value]]. - :type: List of lists List of measurements. Each list is a different measurement. Each measurement in positionally related to a sample. - :param lsSampleNames: List of sample names positionally related (the same) to each list (Optional). - :type: List of strings List of strings. - :param iTopAmount: The amount of top measured samples (assumes the higher measurements are better). - :type: integer Integer amount of sample names/ indices to return. - :return List: List of samples to be selected. - """ - topRankListRet = [] - for rowMetrics in lldMatrix: - #Create 2 d array to hold value and index and sort - liIndexX = [rowMetrics,range(len(rowMetrics))] - liIndexX[1].sort(key = liIndexX[0].__getitem__,reverse = True) + :param lldMatrix: List of lists [[value,value,value,value],[value,value,value,value]]. + :type: List of lists List of measurements. Each list is a different measurement. Each measurement in positionally related to a sample. + :param lsSampleNames: List of sample names positionally related (the same) to each list (Optional). + :type: List of strings List of strings. + :param iTopAmount: The amount of top measured samples (assumes the higher measurements are better). + :type: integer Integer amount of sample names/ indices to return. + :return List: List of samples to be selected. 
+ """ + topRankListRet = [] + for rowMetrics in lldMatrix: + # Create 2 d array to hold value and index and sort + liIndexX = [rowMetrics, range(len(rowMetrics))] + liIndexX[1].sort(key=liIndexX[0].__getitem__, reverse=True) + + if lsSampleNames: + topRankListRet.append([lsSampleNames[iIndex] + for iIndex in liIndexX[1][:iTopAmount]]) + else: + topRankListRet.append(liIndexX[1][:iTopAmount]) + + return topRankListRet - if lsSampleNames: - topRankListRet.append([lsSampleNames[iIndex] for iIndex in liIndexX[1][:iTopAmount]]) - else: - topRankListRet.append(liIndexX[1][:iTopAmount]) + # Group 2## Representative Dissimilarity + # Testing: Happy path tested 1 + def funcGetCentralSamplesByKMedoids(self, npaMatrix=None, sMetric=None, lsSampleNames=None, iNumberSamplesReturned=0, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None): + """ + Gets centroid samples by k-medoids clustering of a given matrix. + + :param npaMatrix: Numpy array where row=features and columns=samples + :type: Numpy array Abundance Data. + :param sMetric: String name of beta metric used as the distance metric. + :type: String String name of beta metric. + :param lsSampleNames: The names of the sample + :type: List List of strings + :param iNumberSamplesReturned: Number of samples to return, each will be a centroid of a sample. + :type: Integer Number of samples to return + :return List: List of selected samples. + :param istmBetaMatrix: File with beta-diversity matrix + :type: File stream or file path string + """ - return topRankListRet - - ####Group 2## Representative Dissimilarity - #Testing: Happy path tested 1 - def funcGetCentralSamplesByKMedoids(self, npaMatrix=None, sMetric=None, lsSampleNames=None, iNumberSamplesReturned=0, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None): - """ - Gets centroid samples by k-medoids clustering of a given matrix. - - :param npaMatrix: Numpy array where row=features and columns=samples - :type: Numpy array Abundance Data. 
- :param sMetric: String name of beta metric used as the distance metric. - :type: String String name of beta metric. - :param lsSampleNames: The names of the sample - :type: List List of strings - :param iNumberSamplesReturned: Number of samples to return, each will be a centroid of a sample. - :type: Integer Number of samples to return - :return List: List of selected samples. - :param istmBetaMatrix: File with beta-diversity matrix - :type: File stream or file path string - """ + # Count of how many rows + sampleCount = npaMatrix.shape[0] + if iNumberSamplesReturned > sampleCount: + logging.error("MicroPITA.funcGetCentralSamplesByKMedoids:: There are not enough samples to return the amount of samples specified. Return sample count = " + + str(iNumberSamplesReturned)+". Sample number = "+str(sampleCount)+".") + return False + + # If the cluster count is equal to the sample count return all samples + if sampleCount == iNumberSamplesReturned: + return list(lsSampleNames) - #Count of how many rows - sampleCount = npaMatrix.shape[0] - if iNumberSamplesReturned > sampleCount: - logging.error("MicroPITA.funcGetCentralSamplesByKMedoids:: There are not enough samples to return the amount of samples specified. Return sample count = "+str(iNumberSamplesReturned)+". 
Sample number = "+str(sampleCount)+".") - return False + # Get distance matrix + distanceMatrix = scipy.spatial.distance.squareform(Metric.funcReadMatrixFile(istmMatrixFile=istmBetaMatrix, lsSampleOrder=lsSampleNames)[ + 0]) if istmBetaMatrix else Metric.funcGetBetaMetric(npadAbundancies=npaMatrix, sMetric=sMetric, istrmTree=istrmTree, istrmEnvr=istrmEnvr, lsSampleOrder=lsSampleNames) + if type(distanceMatrix) is BooleanType: + logging.error( + "MicroPITA.funcGetCentralSamplesByKMedoids:: Could not read in the supplied distance matrix, returning false.") + return False - #If the cluster count is equal to the sample count return all samples - if sampleCount == iNumberSamplesReturned: - return list(lsSampleNames) + # Handle unifrac output + if sMetric in [Metric.c_strUnifracUnweighted, Metric.c_strUnifracWeighted]: + distanceMatrix = distanceMatrix[0] + + # Log distance matrix + logging.debug( + "MicroPITA.funcGetCentralSamplesByKMedoids:: Distance matrix for representative selection using metric="+str(sMetric)) + + distance = MLPYDistanceAdaptor( + npaDistanceMatrix=distanceMatrix, fIsCondensedMatrix=True) - #Get distance matrix - distanceMatrix=scipy.spatial.distance.squareform(Metric.funcReadMatrixFile(istmMatrixFile=istmBetaMatrix,lsSampleOrder=lsSampleNames)[0]) if istmBetaMatrix else Metric.funcGetBetaMetric(npadAbundancies=npaMatrix, sMetric=sMetric, istrmTree=istrmTree, istrmEnvr=istrmEnvr, lsSampleOrder=lsSampleNames) - if type(distanceMatrix) is BooleanType: - logging.error("MicroPITA.funcGetCentralSamplesByKMedoids:: Could not read in the supplied distance matrix, returning false.") - return False + # Create object to determine clusters/medoids + medoidsMaker = Kmedoids(k=iNumberSamplesReturned, dist=distance) + # medoidsData includes(1d numpy array, medoids indexes; + # 1d numpy array, non-medoids indexes; + # 1d numpy array, cluster membership for non-medoids; + # double, cost of configuration) + # npaMatrix is samples x rows + # Build a matrix of lists 
of indicies to pass to the distance matrix + lliIndicesMatrix = [[iIndexPosition] + for iIndexPosition in xrange(0, len(npaMatrix))] + medoidsData = medoidsMaker.compute(np.array(lliIndicesMatrix)) + logging.debug( + "MicroPITA.funcGetCentralSamplesByKMedoids:: Results from the kmedoid method in representative selection:") + logging.debug(str(medoidsData)) + + # If returning the same amount of clusters and samples + # Return centroids + selectedIndexes = medoidsData[0] + return [lsSampleNames[selectedIndexes[index]] for index in xrange(0, iNumberSamplesReturned)] + + # Group 3## Highest Dissimilarity + # Testing: Happy path tested + def funcSelectExtremeSamplesFromHClust(self, strBetaMetric, npaAbundanceMatrix, lsSampleNames, iSelectSampleCount, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None): + """ + Select extreme samples from HClustering. + + :param strBetaMetric: The beta metric to use for distance matrix generation. + :type: String The name of the beta metric to use. + :param npaAbundanceMatrix: Numpy array where row=samples and columns=features. + :type: Numpy Array Abundance data. + :param lsSampleNames: The names of the sample. + :type: List List of strings. + :param iSelectSampleCount: Number of samples to select (return). + :type: Integer Integer number of samples returned. + :return Samples: List of samples. 
+ :param istmBetaMatrix: File with beta-diversity matrix + :type: File stream or file path string + """ + + # If they want all the sample count, return all sample names + iSampleCount = len(npaAbundanceMatrix[:, 0]) + if iSelectSampleCount == iSampleCount: + return lsSampleNames + + # Holds the samples to be returned + lsReturnSamplesRet = [] + + # Generate beta matrix + # Returns condensed matrix + tempDistanceMatrix = scipy.spatial.distance.squareform(Metric.funcReadMatrixFile(istmMatrixFile=istmBetaMatrix, lsSampleOrder=lsSampleNames)[0]) if istmBetaMatrix else Metric.funcGetBetaMetric( + npadAbundancies=npaAbundanceMatrix, sMetric=strBetaMetric, istrmTree=istrmTree, istrmEnvr=istrmEnvr, lsSampleOrder=lsSampleNames, fAdditiveInverse=True) - # Handle unifrac output - if sMetric in [Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted]: - distanceMatrix = distanceMatrix[0] - - #Log distance matrix - logging.debug("MicroPITA.funcGetCentralSamplesByKMedoids:: Distance matrix for representative selection using metric="+str(sMetric)) - - distance = MLPYDistanceAdaptor(npaDistanceMatrix=distanceMatrix, fIsCondensedMatrix=True) - - #Create object to determine clusters/medoids - medoidsMaker = Kmedoids(k=iNumberSamplesReturned, dist=distance) - #medoidsData includes(1d numpy array, medoids indexes; - # 1d numpy array, non-medoids indexes; - # 1d numpy array, cluster membership for non-medoids; - # double, cost of configuration) - #npaMatrix is samples x rows - #Build a matrix of lists of indicies to pass to the distance matrix - lliIndicesMatrix = [[iIndexPosition] for iIndexPosition in xrange(0,len(npaMatrix))] - medoidsData = medoidsMaker.compute(np.array(lliIndicesMatrix)) - logging.debug("MicroPITA.funcGetCentralSamplesByKMedoids:: Results from the kmedoid method in representative selection:") - logging.debug(str(medoidsData)) - - #If returning the same amount of clusters and samples - #Return centroids - selectedIndexes = medoidsData[0] - return 
[lsSampleNames[selectedIndexes[index]] for index in xrange(0,iNumberSamplesReturned)] - - ####Group 3## Highest Dissimilarity - #Testing: Happy path tested - def funcSelectExtremeSamplesFromHClust(self, strBetaMetric, npaAbundanceMatrix, lsSampleNames, iSelectSampleCount, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None): - """ - Select extreme samples from HClustering. - - :param strBetaMetric: The beta metric to use for distance matrix generation. - :type: String The name of the beta metric to use. - :param npaAbundanceMatrix: Numpy array where row=samples and columns=features. - :type: Numpy Array Abundance data. - :param lsSampleNames: The names of the sample. - :type: List List of strings. - :param iSelectSampleCount: Number of samples to select (return). - :type: Integer Integer number of samples returned. - :return Samples: List of samples. - :param istmBetaMatrix: File with beta-diversity matrix - :type: File stream or file path string - """ - - #If they want all the sample count, return all sample names - iSampleCount=len(npaAbundanceMatrix[:,0]) - if iSelectSampleCount==iSampleCount: - return lsSampleNames - - #Holds the samples to be returned - lsReturnSamplesRet = [] - - #Generate beta matrix - #Returns condensed matrix - tempDistanceMatrix = scipy.spatial.distance.squareform(Metric.funcReadMatrixFile(istmMatrixFile=istmBetaMatrix,lsSampleOrder=lsSampleNames)[0]) if istmBetaMatrix else Metric.funcGetBetaMetric(npadAbundancies=npaAbundanceMatrix, sMetric=strBetaMetric, istrmTree=istrmTree, istrmEnvr=istrmEnvr, lsSampleOrder=lsSampleNames, fAdditiveInverse = True) + if strBetaMetric in [Metric.c_strUnifracUnweighted, Metric.c_strUnifracWeighted]: + tempDistanceMatrix = tempDistanceMatrix[0] + + if type(tempDistanceMatrix) is BooleanType: + logging.error( + "MicroPITA.funcSelectExtremeSamplesFromHClust:: Could not read in the supplied distance matrix, returning false.") + return False + + if istmBetaMatrix: + tempDistanceMatrix = 1-tempDistanceMatrix + 
+ # Feed beta matrix to linkage to cluster + # Send condensed matrix + linkageMatrix = hcluster.linkage( + tempDistanceMatrix, method=self.c_strHierarchicalClusterMethod) - if strBetaMetric in [Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted]: - tempDistanceMatrix = tempDistanceMatrix[0] + # Extract cluster information from dendrogram + # The linakge matrix is of the form + # [[int1 int2 doube int3],...] + # int1 and int1 are the paired samples indexed at 0 and up. + # each list is an entry for a branch that is number starting with the first + # list being sample count index + 1 + # each list is then named by an increment as they appear + # this means that if a number is in the list and is = sample count or greater it is not + # terminal and is instead a branch. + # This method just takes the lowest metric measurement (highest distance pairs/clusters) + # Works much better than the original technique + # get total number of samples - if type(tempDistanceMatrix) is BooleanType: - logging.error("MicroPITA.funcSelectExtremeSamplesFromHClust:: Could not read in the supplied distance matrix, returning false.") - return False + iCurrentSelectCount = 0 + for row in linkageMatrix: + # Get nodes ofthe lowest pairing (so the furthest apart pair) + iNode1 = int(row[0]) + iNode2 = int(row[1]) + # Make sure the nodes are a terminal node (sample) and not a branch in the dendrogram + # The branching in the dendrogram will start at the number of samples and increment higher. + # Add each of the pair one at a time breaking when enough samples are selected. 
+ if iNode1 < iSampleCount: + lsReturnSamplesRet.append(lsSampleNames[iNode1]) + iCurrentSelectCount = iCurrentSelectCount + 1 + if iCurrentSelectCount == iSelectSampleCount: + break + if iNode2 < iSampleCount: + lsReturnSamplesRet.append(lsSampleNames[iNode2]) + iCurrentSelectCount = iCurrentSelectCount + 1 + if iCurrentSelectCount == iSelectSampleCount: + break - if istmBetaMatrix: - tempDistanceMatrix = 1-tempDistanceMatrix + # Return selected samples + return lsReturnSamplesRet + + # Group 4## Rank Average of user Defined Taxa + # Testing: Happy Path Tested + def funcGetAverageAbundanceSamples(self, abndTable, lsTargetedFeature, fRank=False): + """ + Averages feature abundance or ranked abundance. Expects a column 0 of taxa id that is skipped. - #Feed beta matrix to linkage to cluster - #Send condensed matrix - linkageMatrix = hcluster.linkage(tempDistanceMatrix, method=self.c_strHierarchicalClusterMethod) - - #Extract cluster information from dendrogram - #The linakge matrix is of the form - #[[int1 int2 doube int3],...] - #int1 and int1 are the paired samples indexed at 0 and up. - #each list is an entry for a branch that is number starting with the first - #list being sample count index + 1 - #each list is then named by an increment as they appear - #this means that if a number is in the list and is = sample count or greater it is not - #terminal and is instead a branch. - #This method just takes the lowest metric measurement (highest distance pairs/clusters) - #Works much better than the original technique - #get total number of samples - - iCurrentSelectCount = 0 - for row in linkageMatrix: - #Get nodes ofthe lowest pairing (so the furthest apart pair) - iNode1 = int(row[0]) - iNode2 = int(row[1]) - #Make sure the nodes are a terminal node (sample) and not a branch in the dendrogram - #The branching in the dendrogram will start at the number of samples and increment higher. - #Add each of the pair one at a time breaking when enough samples are selected. 
- if iNode1<iSampleCount: - lsReturnSamplesRet.append(lsSampleNames[iNode1]) - iCurrentSelectCount = iCurrentSelectCount + 1 - if iCurrentSelectCount == iSelectSampleCount: - break - if iNode2<iSampleCount: - lsReturnSamplesRet.append(lsSampleNames[iNode2]) - iCurrentSelectCount = iCurrentSelectCount + 1 - if iCurrentSelectCount == iSelectSampleCount: - break - - #Return selected samples - return lsReturnSamplesRet - - ####Group 4## Rank Average of user Defined Taxa - #Testing: Happy Path Tested - def funcGetAverageAbundanceSamples(self, abndTable, lsTargetedFeature, fRank=False): - """ - Averages feature abundance or ranked abundance. Expects a column 0 of taxa id that is skipped. - - :param abndTable: Abundance Table to analyse - :type: AbundanceTable Abundance Table - :param lsTargetedFeature: String names - :type: list list of string names of features (bugs) which are measured after ranking against the full sample - :param fRank: Indicates to rank the abundance before getting the average abundance of the features (default false) - :type: boolean Flag indicating ranking abundance before calculating average feature measurement (false= no ranking) - :return List of lists or boolean: List of lists or False on error. One internal list per sample indicating the sample, - feature average abundance or ranked abundance. Lists will already be sorted. - For not Ranked [[sample,average abundance of selected feature,1]] - For Ranked [[sample,average ranked abundance, average abundance of selected feature]] - Error Returns false - """ - - llAbundance = abndTable.funcGetAverageAbundancePerSample(lsTargetedFeature) - if not llAbundance: - logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not get average abundance, returned false. 
Make sure the features (bugs) are spelled correctly and in the abundance table.") - return False - #Add a space for ranking if needed - #Not ranked will be [[sSample,average abundance,1]] - #(where 1 will not discriminant ties if used in later functions, so this generalizes) - #Ranked will be [[sSample, average rank, average abundance]] - llRetAbundance = [[llist[0],-1,llist[1]] for llist in llAbundance] - #Rank if needed - if fRank: - abndRanked = abndTable.funcRankAbundance() - if abndRanked == None: - logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not rank the abundance table, returned false.") - return False - llRetRank = abndRanked.funcGetAverageAbundancePerSample(lsTargetedFeature) - if not llRetRank: - logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not get average ranked abundance, returned false. Make sure the features (bugs) are spelled correctly and in the abundance table.") - return False - dictRanks = dict(llRetRank) - llRetAbundance = [[a[0],dictRanks[a[0]],a[2]] for a in llRetAbundance] - - #Sort first for ties and then for the main feature - if not fRank or ConstantsMicropita.c_fBreakRankTiesByDiversity: - llRetAbundance = sorted(llRetAbundance, key = lambda sampleData: sampleData[2], reverse = not fRank) - if fRank: - llRetAbundance = sorted(llRetAbundance, key = lambda sampleData: sampleData[1], reverse = not fRank) - return llRetAbundance - - #Testing: Happy Path Tested - def funcSelectTargetedTaxaSamples(self, abndMatrix, lsTargetedTaxa, iSampleSelectionCount, sMethod = ConstantsMicropita.lsTargetedFeatureMethodValues[0]): - """ - Selects samples with the highest ranks or abundance of targeted features. 
- If ranked, select the highest abundance for tie breaking - - :param abndMatrix: Abundance table to analyse - :type: AbundanceTable Abundance table - :param lsTargetedTaxa: List of features - :type: list list of strings - :param iSampleSelectionCount: Number of samples to select - :type: integer integer - :param sMethod: Method to select targeted features - :type: string String (Can be values found in ConstantsMicropita.lsTargetedFeatureMethodValues) - :return List of strings: List of sample names which were selected - List of strings Empty list is returned on an error. - """ - - #Check data - if(len(lsTargetedTaxa) < 1): - logging.error("MicroPITA.funcSelectTargetedTaxaSamples. Taxa defined selection was requested but no features were given.") - return [] + :param abndTable: Abundance Table to analyse + :type: AbundanceTable Abundance Table + :param lsTargetedFeature: String names + :type: list list of string names of features (bugs) which are measured after ranking against the full sample + :param fRank: Indicates to rank the abundance before getting the average abundance of the features (default false) + :type: boolean Flag indicating ranking abundance before calculating average feature measurement (false= no ranking) + :return List of lists or boolean: List of lists or False on error. One internal list per sample indicating the sample, + feature average abundance or ranked abundance. Lists will already be sorted. + For not Ranked [[sample,average abundance of selected feature,1]] + For Ranked [[sample,average ranked abundance, average abundance of selected feature]] + Error Returns false + """ + + llAbundance = abndTable.funcGetAverageAbundancePerSample( + lsTargetedFeature) + if not llAbundance: + logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not get average abundance, returned false. 
Make sure the features (bugs) are spelled correctly and in the abundance table.") + return False + # Add a space for ranking if needed + # Not ranked will be [[sSample,average abundance,1]] + # (where 1 will not discriminant ties if used in later functions, so this generalizes) + # Ranked will be [[sSample, average rank, average abundance]] + llRetAbundance = [[llist[0], -1, llist[1]] for llist in llAbundance] + # Rank if needed + if fRank: + abndRanked = abndTable.funcRankAbundance() + if abndRanked == None: + logging.error( + "MicroPITA.funcGetAverageAbundanceSamples:: Could not rank the abundance table, returned false.") + return False + llRetRank = abndRanked.funcGetAverageAbundancePerSample( + lsTargetedFeature) + if not llRetRank: + logging.error( + "MicroPITA.funcGetAverageAbundanceSamples:: Could not get average ranked abundance, returned false. Make sure the features (bugs) are spelled correctly and in the abundance table.") + return False + dictRanks = dict(llRetRank) + llRetAbundance = [[a[0], dictRanks[a[0]], a[2]] + for a in llRetAbundance] + + # Sort first for ties and then for the main feature + if not fRank or ConstantsMicropita.c_fBreakRankTiesByDiversity: + llRetAbundance = sorted( + llRetAbundance, key=lambda sampleData: sampleData[2], reverse=not fRank) + if fRank: + llRetAbundance = sorted( + llRetAbundance, key=lambda sampleData: sampleData[1], reverse=not fRank) + return llRetAbundance + + # Testing: Happy Path Tested + def funcSelectTargetedTaxaSamples(self, abndMatrix, lsTargetedTaxa, iSampleSelectionCount, sMethod=ConstantsMicropita.lsTargetedFeatureMethodValues[0]): + """ + Selects samples with the highest ranks or abundance of targeted features. 
+ If ranked, select the highest abundance for tie breaking + + :param abndMatrix: Abundance table to analyse + :type: AbundanceTable Abundance table + :param lsTargetedTaxa: List of features + :type: list list of strings + :param iSampleSelectionCount: Number of samples to select + :type: integer integer + :param sMethod: Method to select targeted features + :type: string String (Can be values found in ConstantsMicropita.lsTargetedFeatureMethodValues) + :return List of strings: List of sample names which were selected + List of strings Empty list is returned on an error. + """ + + # Check data + if(len(lsTargetedTaxa) < 1): + logging.error( + "MicroPITA.funcSelectTargetedTaxaSamples. Taxa defined selection was requested but no features were given.") + return [] + + lsTargetedSamples = self.funcGetAverageAbundanceSamples(abndTable=abndMatrix, lsTargetedFeature=lsTargetedTaxa, + fRank=sMethod.lower() == self.c_strTargetedRanked.lower()) + # If an error occured or the key word for the method was not recognized + if lsTargetedSamples == False: + logging.error("MicroPITA.funcSelectTargetedTaxaSamples:: Was not able to select for the features given. So targeted feature selection was performed. Check to make sure the features are spelled correctly and exist in the abundance file.") + return [] + + # Select from results + return [sSample[0] for sSample in lsTargetedSamples[:iSampleSelectionCount]] + + # Group 5## Random + # Testing: Happy path Tested + def funcGetRandomSamples(self, lsSamples=None, iNumberOfSamplesToReturn=0): + """ + Returns random sample names of the number given. No replacement. + + :param lsSamples: List of sample names + :type: list list of strings + :param iNumberOfSamplesToReturn: Number of samples to select + :type: integer integer. + :return List: List of selected samples (strings). 
+ """ + + # Input matrix sample count + sampleCount = len(lsSamples) + + # Return the full matrix if they ask for a return matrix where length == original + if(iNumberOfSamplesToReturn >= sampleCount): + return lsSamples + + # Get the random indices for the sample (without replacement) + liRandomIndices = random.sample( + range(sampleCount), iNumberOfSamplesToReturn) + + # Create a boolean array of if indexes are to be included in the reduced array + return [sSample for iIndex, sSample in enumerate(lsSamples) if iIndex in liRandomIndices] - lsTargetedSamples = self.funcGetAverageAbundanceSamples(abndTable=abndMatrix, lsTargetedFeature=lsTargetedTaxa, - fRank=sMethod.lower() == self.c_strTargetedRanked.lower()) - #If an error occured or the key word for the method was not recognized - if lsTargetedSamples == False: - logging.error("MicroPITA.funcSelectTargetedTaxaSamples:: Was not able to select for the features given. So targeted feature selection was performed. Check to make sure the features are spelled correctly and exist in the abundance file.") - return [] - - #Select from results - return [sSample[0] for sSample in lsTargetedSamples[:iSampleSelectionCount]] - - ####Group 5## Random - #Testing: Happy path Tested - def funcGetRandomSamples(self, lsSamples=None, iNumberOfSamplesToReturn=0): - """ - Returns random sample names of the number given. No replacement. - - :param lsSamples: List of sample names - :type: list list of strings - :param iNumberOfSamplesToReturn: Number of samples to select - :type: integer integer. - :return List: List of selected samples (strings). - """ + # Happy path tested (case 3) + def funcGetAveragePopulation(self, abndTable, lfCompress): + """ + Get the average row per column in the abndtable. 
- #Input matrix sample count - sampleCount = len(lsSamples) + :param abndTable: AbundanceTable of data to be averaged + :type: AbudanceTable + :param lfCompress: List of boolean flags (false means to remove sample before averaging + :type: List of floats + :return List of doubles: + """ + if sum(lfCompress) == 0: + return [] + + # Get the average populations + lAverageRet = [] + + for sFeature in abndTable.funcGetAbundanceCopy(): + sFeature = list(sFeature)[1:] + sFeature = np.compress(lfCompress, sFeature, axis=0) + lAverageRet.append(sum(sFeature)/float(len(sFeature))) + return lAverageRet - #Return the full matrix if they ask for a return matrix where length == original - if(iNumberOfSamplesToReturn >= sampleCount): - return lsSamples - - #Get the random indices for the sample (without replacement) - liRandomIndices = random.sample(range(sampleCount), iNumberOfSamplesToReturn) - - #Create a boolean array of if indexes are to be included in the reduced array - return [sSample for iIndex, sSample in enumerate(lsSamples) if iIndex in liRandomIndices] - - #Happy path tested (case 3) - def funcGetAveragePopulation(self, abndTable, lfCompress): - """ - Get the average row per column in the abndtable. + # Happy path tested (2 cases) + def funcGetDistanceFromAverage(self, abndTable, ldAverage, lsSamples, lfSelected): + """ + Given an abundance table and an average sample, this returns the distance of each sample + (measured using brays-curtis dissimilarity) from the average. + The distances are reduced by needing to be in the lsSamples and being a true in the lfSelected + (which is associated with the samples in the order of the samples in the abundance table; + use abundancetable.funcGetSampleNames() to see the order if needed). 
- :param abndTable: AbundanceTable of data to be averaged - :type: AbudanceTable - :param lfCompress: List of boolean flags (false means to remove sample before averaging - :type: List of floats - :return List of doubles: - """ - if sum(lfCompress) == 0: - return [] - - #Get the average populations - lAverageRet = [] + :param abndTable: Abundance table holding the data to be analyzed. + :type: AbundanceTable + :param ldAverage: Average population (Average features of the abundance table of samples) + :type: List of doubles which represent the average population + :param lsSamples: These are the only samples used in the analysis + :type: List of strings (sample ids) + :param lfSelected: Samples to be included in the analysis + :type: List of boolean (true means include) + :return: List of distances (doubles) + """ + # Get the distance from label 1 of all samples in label0 splitting into selected and not selected lists + ldSelectedDistances = [] - for sFeature in abndTable.funcGetAbundanceCopy(): - sFeature = list(sFeature)[1:] - sFeature=np.compress(lfCompress,sFeature,axis=0) - lAverageRet.append(sum(sFeature)/float(len(sFeature))) - return lAverageRet - - #Happy path tested (2 cases) - def funcGetDistanceFromAverage(self, abndTable,ldAverage,lsSamples,lfSelected): - """ - Given an abundance table and an average sample, this returns the distance of each sample - (measured using brays-curtis dissimilarity) from the average. - The distances are reduced by needing to be in the lsSamples and being a true in the lfSelected - (which is associated with the samples in the order of the samples in the abundance table; - use abundancetable.funcGetSampleNames() to see the order if needed). 
+ for sSampleName in [sSample for iindex, sSample in enumerate(lsSamples) if lfSelected[iindex]]: + # Get the sample measurements + ldSelectedDistances.append(Metric.funcGetBrayCurtisDissimilarity( + np.array([abndTable.funcGetSample(sSampleName), ldAverage]))[0]) + return ldSelectedDistances - :param abndTable: Abundance table holding the data to be analyzed. - :type: AbundanceTable - :param ldAverage: Average population (Average features of the abundance table of samples) - :type: List of doubles which represent the average population - :param lsSamples: These are the only samples used in the analysis - :type: List of strings (sample ids) - :param lfSelected: Samples to be included in the analysis - :type: List of boolean (true means include) - :return: List of distances (doubles) - """ - #Get the distance from label 1 of all samples in label0 splitting into selected and not selected lists - ldSelectedDistances = [] + # Happy path tested (1 case) + def funcMeasureDistanceFromLabelToAverageOtherLabel(self, abndTable, lfGroupOfInterest, lfGroupOther): + """ + Get the distance of samples from one label from the average sample of not the label. + Note: This assumes 2 classes. - for sSampleName in [sSample for iindex, sSample in enumerate(lsSamples) if lfSelected[iindex]]: - #Get the sample measurements - ldSelectedDistances.append(Metric.funcGetBrayCurtisDissimilarity(np.array([abndTable.funcGetSample(sSampleName),ldAverage]))[0]) - return ldSelectedDistances - - #Happy path tested (1 case) - def funcMeasureDistanceFromLabelToAverageOtherLabel(self, abndTable, lfGroupOfInterest, lfGroupOther): - """ - Get the distance of samples from one label from the average sample of not the label. - Note: This assumes 2 classes. + :param abndTable: Table of data to work out of. + :type: Abundace Table + :param lfGroupOfInterest: Boolean indicator of the sample being in the first group. + :type: List of floats, true indicating an individual in the group of interest. 
+ :param lfGroupOther: Boolean indicator of the sample being in the other group. + :type: List of floats, true indicating an individual in the + :return List of List of doubles: [list of tuples (string sample name,double distance) for the selected population, list of tuples for the not selected population] + """ + # Get all sample names + lsAllSamples = abndTable.funcGetSampleNames() - :param abndTable: Table of data to work out of. - :type: Abundace Table - :param lfGroupOfInterest: Boolean indicator of the sample being in the first group. - :type: List of floats, true indicating an individual in the group of interest. - :param lfGroupOther: Boolean indicator of the sample being in the other group. - :type: List of floats, true indicating an individual in the - :return List of List of doubles: [list of tuples (string sample name,double distance) for the selected population, list of tuples for the not selected population] - """ - #Get all sample names - lsAllSamples = abndTable.funcGetSampleNames() + # Get average populations + lAverageOther = self.funcGetAveragePopulation( + abndTable=abndTable, lfCompress=lfGroupOther) + + # Get the distance from the average of the other label (label 1) + ldSelectedDistances = self.funcGetDistanceFromAverage(abndTable=abndTable, ldAverage=lAverageOther, + lsSamples=lsAllSamples, lfSelected=lfGroupOfInterest) + + return zip([lsAllSamples[iindex] for iindex, fGroup in enumerate(lfGroupOfInterest) if fGroup], ldSelectedDistances) - #Get average populations - lAverageOther = self.funcGetAveragePopulation(abndTable=abndTable, lfCompress=lfGroupOther) - - #Get the distance from the average of the other label (label 1) - ldSelectedDistances = self.funcGetDistanceFromAverage(abndTable=abndTable, ldAverage=lAverageOther, - lsSamples=lsAllSamples, lfSelected=lfGroupOfInterest) + # Happy path tested (1 test case) + def funcPerformDistanceSelection(self, abndTable, iSelectionCount, sLabel, sValueOfInterest): + """ + Given metadata, metadata 
of one value (sValueOfInterest) is measured from the average (centroid) value of another label group. + An iSelectionCount of samples is selected from the group of interest closest to and furthest from the centroid of the other group. - return zip([lsAllSamples[iindex] for iindex, fGroup in enumerate(lfGroupOfInterest) if fGroup],ldSelectedDistances) - - #Happy path tested (1 test case) - def funcPerformDistanceSelection(self, abndTable, iSelectionCount, sLabel, sValueOfInterest): - """ - Given metadata, metadata of one value (sValueOfInterest) is measured from the average (centroid) value of another label group. - An iSelectionCount of samples is selected from the group of interest closest to and furthest from the centroid of the other group. + :params abndTable: Abundance of measurements + :type: AbundanceTable + :params iSelectionCount: The number of samples selected per sample. + :type: Integer Integer greater than 0 + :params sLabel: ID of the metadata which is the supervised label + :type: String + :params sValueOfInterest: Metadata value in the sLabel metadta row of the abundance table which defines the group of interest. + :type: String found in the abundance table metadata row indicated by sLabel. + :return list list of tuples (samplename, distance) [[iSelectionCount of tuples closest to the other centroid], [iSelectionCount of tuples farthest from the other centroid], [all tuples of samples not selected]] + """ - :params abndTable: Abundance of measurements - :type: AbundanceTable - :params iSelectionCount: The number of samples selected per sample. - :type: Integer Integer greater than 0 - :params sLabel: ID of the metadata which is the supervised label - :type: String - :params sValueOfInterest: Metadata value in the sLabel metadta row of the abundance table which defines the group of interest. - :type: String found in the abundance table metadata row indicated by sLabel. 
- :return list list of tuples (samplename, distance) [[iSelectionCount of tuples closest to the other centroid], [iSelectionCount of tuples farthest from the other centroid], [all tuples of samples not selected]] - """ + lsMetadata = abndTable.funcGetMetadata(sLabel) + # Other metadata values + lsUniqueOtherValues = list(set(lsMetadata)-set(sValueOfInterest)) + + # Get boolean indicator of values of interest + lfLabelsInterested = [sValueOfInterest == + sValue for sValue in lsMetadata] - lsMetadata = abndTable.funcGetMetadata(sLabel) - #Other metadata values - lsUniqueOtherValues = list(set(lsMetadata)-set(sValueOfInterest)) + # Get the distances of the items of interest from the other metadata values + dictDistanceAverages = {} + for sOtherLabel in lsUniqueOtherValues: + # Get boolean indicator of labels not of interest + lfLabelsOther = [sOtherLabel == sValue for sValue in lsMetadata] - #Get boolean indicator of values of interest - lfLabelsInterested = [sValueOfInterest == sValue for sValue in lsMetadata] - - #Get the distances of the items of interest from the other metadata values - dictDistanceAverages = {} - for sOtherLabel in lsUniqueOtherValues: - #Get boolean indicator of labels not of interest - lfLabelsOther = [sOtherLabel == sValue for sValue in lsMetadata] + # Get the distances of data from two different groups to the average of the other + ldValueDistances = dict(self.funcMeasureDistanceFromLabelToAverageOtherLabel( + abndTable, lfLabelsInterested, lfLabelsOther)) - #Get the distances of data from two different groups to the average of the other - ldValueDistances = dict(self.funcMeasureDistanceFromLabelToAverageOtherLabel(abndTable, lfLabelsInterested, lfLabelsOther)) + for sKey in ldValueDistances: + dictDistanceAverages[sKey] = ldValueDistances[sKey] + \ + dictDistanceAverages[sKey] if sKey in dictDistanceAverages else ldValueDistances[sKey] - for sKey in ldValueDistances: - dictDistanceAverages[sKey] = ldValueDistances[sKey] + 
dictDistanceAverages[sKey] if sKey in dictDistanceAverages else ldValueDistances[sKey] + # Finish average by dividing by length of lsUniqueOtherValues + ltpleAverageDistances = [(sKey, dictDistanceAverages[sKey]/float( + len(lsUniqueOtherValues))) for sKey in dictDistanceAverages] - #Finish average by dividing by length of lsUniqueOtherValues - ltpleAverageDistances = [(sKey, dictDistanceAverages[sKey]/float(len(lsUniqueOtherValues))) for sKey in dictDistanceAverages] + # Sort to extract extremes + ltpleAverageDistances = sorted( + ltpleAverageDistances, key=operator.itemgetter(1)) - #Sort to extract extremes - ltpleAverageDistances = sorted(ltpleAverageDistances,key=operator.itemgetter(1)) + # Get the closest and farthest distances + ltupleDiscriminantSamples = ltpleAverageDistances[:iSelectionCount] + ltupleDistinctSamples = ltpleAverageDistances[iSelectionCount*-1:] - #Get the closest and farthest distances - ltupleDiscriminantSamples = ltpleAverageDistances[:iSelectionCount] - ltupleDistinctSamples = ltpleAverageDistances[iSelectionCount*-1:] + # Remove the selected samples from the larger population of distances (better visualization) + ldSelected = [tpleSelected[0] + for tpleSelected in ltupleDiscriminantSamples+ltupleDistinctSamples] - #Remove the selected samples from the larger population of distances (better visualization) - ldSelected = [tpleSelected[0] for tpleSelected in ltupleDiscriminantSamples+ltupleDistinctSamples] + # Return discriminant tuples, distinct tuples, other tuples + return [ltupleDiscriminantSamples, ltupleDistinctSamples, + [tplData for tplData in ltpleAverageDistances if tplData[0] not in ldSelected]] - #Return discriminant tuples, distinct tuples, other tuples - return [ltupleDiscriminantSamples, ltupleDistinctSamples, - [tplData for tplData in ltpleAverageDistances if tplData[0] not in ldSelected]] - - #Run the supervised method surrounding distance from centroids - #Happy path tested (3 test cases) - def 
funcRunSupervisedDistancesFromCentroids(self, abundanceTable, fRunDistinct, fRunDiscriminant, - xOutputSupFile, xPredictSupFile, strSupervisedMetadata, - iSampleSupSelectionCount, lsOriginalSampleNames, lsOriginalLabels, fAppendFiles = False): - """ - Runs supervised methods based on measuring distances of one label from the centroid of another. NAs are evaluated as theirown group. + # Run the supervised method surrounding distance from centroids + # Happy path tested (3 test cases) + def funcRunSupervisedDistancesFromCentroids(self, abundanceTable, fRunDistinct, fRunDiscriminant, + xOutputSupFile, xPredictSupFile, strSupervisedMetadata, + iSampleSupSelectionCount, lsOriginalSampleNames, lsOriginalLabels, fAppendFiles=False): + """ + Runs supervised methods based on measuring distances of one label from the centroid of another. NAs are evaluated as theirown group. - :param abundanceTable: AbundanceTable - :type: AbudanceTable Data to analyze - :param fRunDistinct: Run distinct selection method - :type: Boolean boolean (true runs method) - :param fRunDiscriminant: Run discriminant method - :type: Boolean boolean (true runs method) - :param xOutputSupFile: File output from supervised methods detailing data going into the method. - :type: String or FileStream - :param xPredictSupFile: File output from supervised methods distance results from supervised methods. - :type: String or FileStream - :param strSupervisedMetadata: The metadata that will be used to group samples. - :type: String - :param iSampleSupSelectionCount: Number of samples to select - :type: Integer int sample selection count - :param lsOriginalSampleNames: List of the sample names, order is important and should be preserved from the abundanceTable. - :type: List of samples - :param fAppendFiles: Indicates that output files already exist and appending is occuring. 
- :type: Boolean - :return Selected Samples: A dictionary of selected samples by selection ID - Dictionary {"Selection Method":["SampleID","SampleID"...]} - """ - #Get labels and run one label against many - lstrMetadata = abundanceTable.funcGetMetadata(strSupervisedMetadata) - dictlltpleDistanceMeasurements = {} - for sMetadataValue in set(lstrMetadata): + :param abundanceTable: AbundanceTable + :type: AbudanceTable Data to analyze + :param fRunDistinct: Run distinct selection method + :type: Boolean boolean (true runs method) + :param fRunDiscriminant: Run discriminant method + :type: Boolean boolean (true runs method) + :param xOutputSupFile: File output from supervised methods detailing data going into the method. + :type: String or FileStream + :param xPredictSupFile: File output from supervised methods distance results from supervised methods. + :type: String or FileStream + :param strSupervisedMetadata: The metadata that will be used to group samples. + :type: String + :param iSampleSupSelectionCount: Number of samples to select + :type: Integer int sample selection count + :param lsOriginalSampleNames: List of the sample names, order is important and should be preserved from the abundanceTable. + :type: List of samples + :param fAppendFiles: Indicates that output files already exist and appending is occuring. 
+ :type: Boolean + :return Selected Samples: A dictionary of selected samples by selection ID + Dictionary {"Selection Method":["SampleID","SampleID"...]} + """ + # Get labels and run one label against many + lstrMetadata = abundanceTable.funcGetMetadata(strSupervisedMetadata) + dictlltpleDistanceMeasurements = {} + for sMetadataValue in set(lstrMetadata): + + # For now perform the selection here for the label of interest against the other labels + dictlltpleDistanceMeasurements.setdefault(sMetadataValue, []).extend(self.funcPerformDistanceSelection(abndTable=abundanceTable, + iSelectionCount=iSampleSupSelectionCount, sLabel=strSupervisedMetadata, sValueOfInterest=sMetadataValue)) - #For now perform the selection here for the label of interest against the other labels - dictlltpleDistanceMeasurements.setdefault(sMetadataValue,[]).extend(self.funcPerformDistanceSelection(abndTable=abundanceTable, - iSelectionCount=iSampleSupSelectionCount, sLabel=strSupervisedMetadata, sValueOfInterest=sMetadataValue)) + # Make expected output files for supervised methods + # 1. Output file which is similar to an input file for SVMs + # 2. Output file that is similar to the probabilitic output of a SVM (LibSVM) + # Manly for making output of supervised methods (Distance from Centroid) similar + # MicropitaVis needs some of these files + if xOutputSupFile: + if fAppendFiles: + SVM.funcUpdateSVMFileWithAbundanceTable(abndAbundanceTable=abundanceTable, xOutputSVMFile=xOutputSupFile, + lsOriginalLabels=lsOriginalLabels, lsSampleOrdering=lsOriginalSampleNames) + else: + SVM.funcConvertAbundanceTableToSVMFile(abndAbundanceTable=abundanceTable, xOutputSVMFile=xOutputSupFile, + sMetadataLabel=strSupervisedMetadata, lsOriginalLabels=lsOriginalLabels, lsSampleOrdering=lsOriginalSampleNames) - #Make expected output files for supervised methods - #1. Output file which is similar to an input file for SVMs - #2. 
Output file that is similar to the probabilitic output of a SVM (LibSVM) - #Manly for making output of supervised methods (Distance from Centroid) similar - #MicropitaVis needs some of these files - if xOutputSupFile: - if fAppendFiles: - SVM.funcUpdateSVMFileWithAbundanceTable(abndAbundanceTable=abundanceTable, xOutputSVMFile=xOutputSupFile, - lsOriginalLabels=lsOriginalLabels, lsSampleOrdering=lsOriginalSampleNames) - else: - SVM.funcConvertAbundanceTableToSVMFile(abndAbundanceTable=abundanceTable, xOutputSVMFile=xOutputSupFile, - sMetadataLabel=strSupervisedMetadata, lsOriginalLabels=lsOriginalLabels, lsSampleOrdering=lsOriginalSampleNames) + # Will contain the samples selected to return + # One or more of the methods may be active so this is why I am extending instead of just returning the result of each method type + dictSelectedSamplesRet = dict() + for sKey, ltplDistances in dictlltpleDistanceMeasurements.items(): + if fRunDistinct: + dictSelectedSamplesRet.setdefault(ConstantsMicropita.c_strDistinct, []).extend([ + ltple[0] for ltple in ltplDistances[1]]) + if fRunDiscriminant: + dictSelectedSamplesRet.setdefault(ConstantsMicropita.c_strDiscriminant, []).extend([ + ltple[0] for ltple in ltplDistances[0]]) - #Will contain the samples selected to return - #One or more of the methods may be active so this is why I am extending instead of just returning the result of each method type - dictSelectedSamplesRet = dict() - for sKey, ltplDistances in dictlltpleDistanceMeasurements.items(): - if fRunDistinct: - dictSelectedSamplesRet.setdefault(ConstantsMicropita.c_strDistinct,[]).extend([ltple[0] for ltple in ltplDistances[1]]) - if fRunDiscriminant: - dictSelectedSamplesRet.setdefault(ConstantsMicropita.c_strDiscriminant,[]).extend([ltple[0] for ltple in ltplDistances[0]]) + if xPredictSupFile: + dictFlattenedDistances = dict() + [dictFlattenedDistances.setdefault(sKey, []).append(tple) + for sKey, lltple in dictlltpleDistanceMeasurements.items() + for ltple in 
lltple for tple in ltple] + if fAppendFiles: + self._updatePredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xOutputSupFile, + dictltpleDistanceMeasurements=dictFlattenedDistances, abundanceTable=abundanceTable, lsOriginalSampleNames=lsOriginalSampleNames) + else: + self._writeToPredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xOutputSupFile, + dictltpleDistanceMeasurements=dictFlattenedDistances, abundanceTable=abundanceTable, lsOriginalSampleNames=lsOriginalSampleNames) + return dictSelectedSamplesRet - if xPredictSupFile: - dictFlattenedDistances = dict() - [dictFlattenedDistances.setdefault(sKey, []).append(tple) - for sKey, lltple in dictlltpleDistanceMeasurements.items() - for ltple in lltple for tple in ltple] - if fAppendFiles: - self._updatePredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xOutputSupFile, - dictltpleDistanceMeasurements=dictFlattenedDistances, abundanceTable=abundanceTable, lsOriginalSampleNames=lsOriginalSampleNames) - else: - self._writeToPredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xOutputSupFile, - dictltpleDistanceMeasurements=dictFlattenedDistances, abundanceTable=abundanceTable, lsOriginalSampleNames=lsOriginalSampleNames) - return dictSelectedSamplesRet + # Two happy path test cases + def _updatePredictFile(self, xPredictSupFile, xInputLabelsFile, dictltpleDistanceMeasurements, abundanceTable, lsOriginalSampleNames): + """ + Manages updating the predict file. - #Two happy path test cases - def _updatePredictFile(self, xPredictSupFile, xInputLabelsFile, dictltpleDistanceMeasurements, abundanceTable, lsOriginalSampleNames): - """ - Manages updating the predict file. + :param xPredictSupFile: File that has predictions (distances) from the supervised method. + :type: FileStream or String file path + :param xInputLabelsFile: File that as input to the supervised methods. 
+ :type: FileStream or String file path + :param dictltpleDistanceMeasurements: + :type: Dictionary of lists of tuples {"labelgroup":[("SampleName",dDistance)...], "labelgroup":[("SampleName",dDistance)...]} + """ + + if not isinstance(xPredictSupFile, str): + xPredictSupFile.close() + xPredictSupFile = xPredictSupFile.name + csvr = open(xPredictSupFile, 'r') + + f = csv.reader( + csvr, delimiter=ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace) + lsHeader = f.next()[1:] + dictlltpleRead = dict([(sHeader, []) for sHeader in lsHeader]) - :param xPredictSupFile: File that has predictions (distances) from the supervised method. - :type: FileStream or String file path - :param xInputLabelsFile: File that as input to the supervised methods. - :type: FileStream or String file path - :param dictltpleDistanceMeasurements: - :type: Dictionary of lists of tuples {"labelgroup":[("SampleName",dDistance)...], "labelgroup":[("SampleName",dDistance)...]} - """ + # Read data in + iSampleIndex = 0 + for sRow in f: + sLabel = sRow[0] + [dictlltpleRead[lsHeader[iDistanceIndex]].append((lsOriginalSampleNames[iSampleIndex], dDistance)) for iDistanceIndex, dDistance in enumerate(sRow[1:]) + if not dDistance == ConstantsMicropita.c_sEmptyPredictFileValue] + iSampleIndex += 1 - if not isinstance(xPredictSupFile, str): - xPredictSupFile.close() - xPredictSupFile = xPredictSupFile.name - csvr = open(xPredictSupFile,'r') + # Combine dictltpleDistanceMeasurements with new data + # If they share a key then merge keeping parameter data + # If they do not share the key, keep the full data + dictNew = {} + for sKey in dictltpleDistanceMeasurements.keys(): + lsSamples = [tple[0] + for tple in dictltpleDistanceMeasurements[sKey]] + dictNew[sKey] = dictltpleDistanceMeasurements[sKey]+[tple for tple in dictlltpleRead[sKey] if tple[0] + not in lsSamples] if sKey in dictlltpleRead.keys() else dictltpleDistanceMeasurements[sKey] + for sKey in dictlltpleRead: + if sKey not in 
dictltpleDistanceMeasurements.keys(): + dictNew[sKey] = dictlltpleRead[sKey] - f = csv.reader(csvr,delimiter=ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace) - lsHeader = f.next()[1:] - dictlltpleRead = dict([(sHeader,[]) for sHeader in lsHeader]) - - #Read data in - iSampleIndex = 0 - for sRow in f: - sLabel = sRow[0] - [dictlltpleRead[lsHeader[iDistanceIndex]].append((lsOriginalSampleNames[iSampleIndex],dDistance)) for iDistanceIndex, dDistance in enumerate(sRow[1:]) - if not dDistance == ConstantsMicropita.c_sEmptyPredictFileValue] - iSampleIndex += 1 + # Call writer + self._writeToPredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xInputLabelsFile, + dictltpleDistanceMeasurements=dictNew, abundanceTable=abundanceTable, + lsOriginalSampleNames=lsOriginalSampleNames, fFromUpdate=True) - #Combine dictltpleDistanceMeasurements with new data - #If they share a key then merge keeping parameter data - #If they do not share the key, keep the full data - dictNew = {} - for sKey in dictltpleDistanceMeasurements.keys(): - lsSamples = [tple[0] for tple in dictltpleDistanceMeasurements[sKey]] - dictNew[sKey] = dictltpleDistanceMeasurements[sKey]+[tple for tple in dictlltpleRead[sKey] if tple[0] not in lsSamples] if sKey in dictlltpleRead.keys() else dictltpleDistanceMeasurements[sKey] - for sKey in dictlltpleRead: - if sKey not in dictltpleDistanceMeasurements.keys(): - dictNew[sKey] = dictlltpleRead[sKey] + # 2 happy path test cases + def _writeToPredictFile(self, xPredictSupFile, xInputLabelsFile, dictltpleDistanceMeasurements, abundanceTable, lsOriginalSampleNames, fFromUpdate=False): + """ + Write to the predict file. - #Call writer - self._writeToPredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xInputLabelsFile, - dictltpleDistanceMeasurements=dictNew, abundanceTable=abundanceTable, - lsOriginalSampleNames=lsOriginalSampleNames, fFromUpdate=True) + :param xPredictSupFile: File that has predictions (distances) from the supervised method. 
+ :type: FileStream or String file path + :param xInputLabelsFile: File that as input to the supervised methods. + :type: FileStream or String file path + :param dictltpleDistanceMeasurements: + :type: Dictionary of lists of tuples {"labelgroup":[("SampleName",dDistance)...], "labelgroup":[("SampleName",dDistance)...]} + :param abundanceTable: An abundance table of the sample data. + :type: AbundanceTable + :param lsOriginalSampleNames: Used if the file is being updated as the sample names so that it may be passed in and consistent with other writing. + Otherwise will use the sample names from the abundance table. + :type: List of strings + :param fFromUpdate: Indicates if this is part of an update to the file or not. + :type: Boolean + """ - #2 happy path test cases - def _writeToPredictFile(self, xPredictSupFile, xInputLabelsFile, dictltpleDistanceMeasurements, abundanceTable, lsOriginalSampleNames, fFromUpdate=False): - """ - Write to the predict file. + xInputLabelsFileName = xInputLabelsFile + if not isinstance(xInputLabelsFile, str): + xInputLabelsFileName = xInputLabelsFile.name + f = csv.writer(open(xPredictSupFile, "w") if isinstance(xPredictSupFile, str) + else xPredictSupFile, delimiter=ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace) - :param xPredictSupFile: File that has predictions (distances) from the supervised method. - :type: FileStream or String file path - :param xInputLabelsFile: File that as input to the supervised methods. - :type: FileStream or String file path - :param dictltpleDistanceMeasurements: - :type: Dictionary of lists of tuples {"labelgroup":[("SampleName",dDistance)...], "labelgroup":[("SampleName",dDistance)...]} - :param abundanceTable: An abundance table of the sample data. - :type: AbundanceTable - :param lsOriginalSampleNames: Used if the file is being updated as the sample names so that it may be passed in and consistent with other writing. - Otherwise will use the sample names from the abundance table. 
- :type: List of strings - :param fFromUpdate: Indicates if this is part of an update to the file or not. - :type: Boolean - """ + lsAllSampleNames = abundanceTable.funcGetSampleNames() + lsLabels = SVM.funcReadLabelsFromFile(xSVMFile=xInputLabelsFileName, lsAllSampleNames=lsOriginalSampleNames if fFromUpdate else lsAllSampleNames, + isPredictFile=False) + dictLabels = dict([(sSample, sLabel) for sLabel in lsLabels.keys() + for sSample in lsLabels[sLabel]]) - xInputLabelsFileName = xInputLabelsFile - if not isinstance(xInputLabelsFile,str): - xInputLabelsFileName = xInputLabelsFile.name - f = csv.writer(open(xPredictSupFile,"w") if isinstance(xPredictSupFile, str) else xPredictSupFile,delimiter=ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace) + # Dictionay keys will be used to order the predict file + lsMeasurementKeys = dictltpleDistanceMeasurements.keys() + # Make header + f.writerow(["labels"]+lsMeasurementKeys) - lsAllSampleNames = abundanceTable.funcGetSampleNames() - lsLabels = SVM.funcReadLabelsFromFile(xSVMFile=xInputLabelsFileName, lsAllSampleNames= lsOriginalSampleNames if fFromUpdate else lsAllSampleNames, - isPredictFile=False) - dictLabels = dict([(sSample,sLabel) for sLabel in lsLabels.keys() for sSample in lsLabels[sLabel]]) - - #Dictionay keys will be used to order the predict file - lsMeasurementKeys = dictltpleDistanceMeasurements.keys() - #Make header - f.writerow(["labels"]+lsMeasurementKeys) + # Reformat dictionary to make it easier to use + for sKey in dictltpleDistanceMeasurements: + dictltpleDistanceMeasurements[sKey] = dict( + [ltpl for ltpl in dictltpleDistanceMeasurements[sKey]]) - #Reformat dictionary to make it easier to use - for sKey in dictltpleDistanceMeasurements: - dictltpleDistanceMeasurements[sKey] = dict([ltpl for ltpl in dictltpleDistanceMeasurements[sKey]]) + for sSample in lsOriginalSampleNames: + # Make body of file + f.writerow([dictLabels.get(sSample, ConstantsMicropita.c_sEmptyPredictFileValue)] + + 
def _funcRunNormalizeSensitiveMethods(self, abndData, iSampleSelectionCount, dictSelectedSamples, lsAlphaMetrics, lsBetaMetrics, lsInverseBetaMetrics,
                                      fRunDiversity, fRunRepresentative, fRunExtreme, strAlphaMetadata=None,
                                      istmBetaMatrix=None, istrmTree=None, istrmEnvr=None, fInvertDiversity=False):
    """
    Manages running methods that are sensitive to normalization. This is called twice, once for the set of
    methods which should not be normalized and the other for the set that should be normalized.

    Selections are appended to dictSelectedSamples in place and that same dictionary is returned.

    :param abndData: Abundance table object holding the samples to be measured.
    :type: AbundanceTable
    :param iSampleSelectionCount: The number of samples to select per method.
    :type: Integer
    :param dictSelectedSamples: Will be added to as samples are selected {"Method":["strSelectedSampleID","strSelectedSampleID"...]}.
    :type: Dictionary
    :param lsAlphaMetrics: List of alpha metrics to use on alpha metric dependent assays (like highest diversity).
    :type: List of strings
    :param lsBetaMetrics: List of beta metrics to use on beta metric dependent assays (like most representative).
    :type: List of strings
    :param lsInverseBetaMetrics: List of inverse beta metrics to use on inverse beta metric dependent assays (like most dissimilar).
    :type: List of strings
    :param fRunDiversity: Run Diversity based methods (true indicates run).
    :type: Boolean
    :param fRunRepresentative: Run Representative based methods (true indicates run).
    :type: Boolean
    :param fRunExtreme: Run Extreme based methods (true indicates run).
    :type: Boolean
    :param strAlphaMetadata: Metadata id whose values are read from abndData and used as the alpha
                             measurements instead of computing lsAlphaMetrics.
    :type: String
    :param istmBetaMatrix: File that has a precalculated beta matrix. When given, the custom metric is used
                           in place of lsBetaMetrics / lsInverseBetaMetrics.
    :type: File stream or File path string
    :param istrmTree: Passed through unchanged to the representative / extreme selection helpers.
    :type: File stream or File path string
    :param istrmEnvr: Passed through unchanged to the representative / extreme selection helpers.
    :type: File stream or File path string
    :param fInvertDiversity: When true each alpha measurement x is replaced by 1/max(x, c_smallNumber) before ranking.
    :type: Boolean
    :return Selected Samples: Samples selected by methods.
            Dictionary {"Selection Method":["SampleID","SampleID","SampleID",...]}
    """

    # Sample ids/names
    lsSampleNames = abndData.funcGetSampleNames()

    # Generate alpha metrics and get most diverse
    if fRunDiversity:
        # Name of each technique; one row of the alpha matrix is expected per name
        lsMethodNames = [strAlphaMetadata] if strAlphaMetadata else lsAlphaMetrics

        if strAlphaMetadata:
            # Alpha diversity was supplied as metadata, use it directly
            internalAlphaMatrix = [[float(strNum) for strNum in abndData.funcGetMetadata(strAlphaMetadata)]]
        else:
            # Summed tables are first reduced to their terminal nodes before measuring
            if abndData.funcIsSummed():
                npaAbundance = abndData.funcGetFeatureAbundanceTable(abndData.funcGetTerminalNodes()).funcGetAbundanceCopy()
            else:
                npaAbundance = abndData.funcGetAbundanceCopy()
            # Expects Observations (Taxa (row) x sample (column))
            # Returns [[metric1-sample1, metric1-sample2, metric1-sample3],[metric1-sample1, metric1-sample2, metric1-sample3]]
            internalAlphaMatrix = Metric.funcBuildAlphaMetricsMatrix(npaSampleAbundance=npaAbundance,
                                                                     lsSampleNames=lsSampleNames,
                                                                     lsDiversityMetricAlpha=lsAlphaMetrics)

        if internalAlphaMatrix:
            # Invert measurements; 1.0 keeps this a true division even for integer values
            if fInvertDiversity:
                internalAlphaMatrix = [[1.0 / max(dValue, ConstantsMicropita.c_smallNumber) for dValue in lsLine]
                                       for lsLine in internalAlphaMatrix]

            # Get top ranked alpha diversity by most diverse
            # Expects [[sample1,sample2,sample3...],[sample1,sample2,sample3..],...]
            # Returns [[sampleName1, sampleName2, sampleNameN],[sampleName1, sampleName2, sampleNameN]]
            mostDiverseAlphaSamplesIndexes = self.funcGetTopRankedSamples(
                lldMatrix=internalAlphaMatrix, lsSampleNames=lsSampleNames, iTopAmount=iSampleSelectionCount)

            # Add to results (enumerate instead of xrange so this also runs on Python 3)
            for index, strMetric in enumerate(lsMethodNames):
                strSelectionMethod = self.dictConvertAMetricDiversity.get(
                    strMetric, ConstantsMicropita.c_strDiversity + "=" + strMetric)
                dictSelectedSamples.setdefault(strSelectionMethod, []).extend(
                    mostDiverseAlphaSamplesIndexes[index])

    logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Selected Samples 1b")
    logging.info(dictSelectedSamples)

    # Generate beta metrics and select representative / extreme samples
    if fRunRepresentative or fRunExtreme:

        # Abundance matrix transposed
        npaTransposedAbundance = UtilityMath.funcTransposeDataMatrix(
            abndData.funcGetAbundanceCopy(), fRemoveAdornments=True)

        # Keyword arguments every selection call below has in common
        dictSharedArgs = dict(lsSampleNames=lsSampleNames, istmBetaMatrix=istmBetaMatrix,
                              istrmTree=istrmTree, istrmEnvr=istrmEnvr)

        # Get center selection using clusters/tiling
        # This will be for beta metrics in normalized space
        if fRunRepresentative:
            if istmBetaMatrix:
                # Get representative dissimilarity samples from the precalculated matrix
                medoidSamples = self.funcGetCentralSamplesByKMedoids(
                    npaMatrix=npaTransposedAbundance, sMetric=ConstantsMicropita.c_custom,
                    iNumberSamplesReturned=iSampleSelectionCount, **dictSharedArgs)
                if medoidSamples:
                    dictSelectedSamples.setdefault(
                        ConstantsMicropita.c_strRepresentative + "=" + ConstantsMicropita.c_custom, []).extend(medoidSamples)
            else:
                logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Performing representative selection on normalized data.")
                for bMetric in lsBetaMetrics:
                    # Get representative dissimilarity samples
                    medoidSamples = self.funcGetCentralSamplesByKMedoids(
                        npaMatrix=npaTransposedAbundance, sMetric=bMetric,
                        iNumberSamplesReturned=iSampleSelectionCount, **dictSharedArgs)
                    if medoidSamples:
                        dictSelectedSamples.setdefault(
                            self.dictConvertBMetricToMethod.get(bMetric, ConstantsMicropita.c_strRepresentative + "=" + bMetric),
                            []).extend(medoidSamples)

        # Get extreme selection using clusters, tiling
        if fRunExtreme:
            logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Performing extreme selection on normalized data.")
            # Samples for representative dissimilarity
            # This involves inverting the distance metric,
            # Taking the dendrogram level of where the number cluster == the number of samples to select
            # Returning a representative sample from each cluster
            if istmBetaMatrix:
                extremeSamples = self.funcSelectExtremeSamplesFromHClust(
                    strBetaMetric=ConstantsMicropita.c_custom, npaAbundanceMatrix=npaTransposedAbundance,
                    iSelectSampleCount=iSampleSelectionCount, **dictSharedArgs)
                # Add selected samples
                if extremeSamples:
                    dictSelectedSamples.setdefault(
                        ConstantsMicropita.c_strExtreme + "=" + ConstantsMicropita.c_custom, []).extend(extremeSamples)
            else:
                for bMetric in lsInverseBetaMetrics:
                    extremeSamples = self.funcSelectExtremeSamplesFromHClust(
                        strBetaMetric=bMetric, npaAbundanceMatrix=npaTransposedAbundance,
                        iSelectSampleCount=iSampleSelectionCount, **dictSharedArgs)
                    # Add selected samples
                    if extremeSamples:
                        dictSelectedSamples.setdefault(
                            self.dictConvertInvBMetricToMethod.get(bMetric, ConstantsMicropita.c_strExtreme + "=" + bMetric),
                            []).extend(extremeSamples)

    logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Selected Samples 2,3b")
    logging.info(dictSelectedSamples)
    return dictSelectedSamples
def funcRun(self, strIDName, strLastMetadataName, istmInput,
            ostmInputPredictFile, ostmPredictFile, ostmCheckedFile, ostmOutput,
            cDelimiter, cFeatureNameDelimiter, strFeatureSelection,
            istmFeatures, iCount, lstrMethods, strLastRowMetadata=None, strLabel=None, strStratify=None,
            strCustomAlpha=None, strCustomBeta=None, strAlphaMetadata=None, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None,
            iMinSeqs=ConstantsMicropita.c_liOccurenceFilter[0], iMinSamples=ConstantsMicropita.c_liOccurenceFilter[1], fInvertDiversity=False):
    """
    Manages the selection of samples given different metrics.

    :param strIDName: Sample Id metadata row
    :type: String
    :param strLastMetadataName: The id of the metadata positioned last in the abundance table.
    :type: String
    :param istmInput: Abundance table to read; handed to AbundanceTable.funcMakeFromFile as the input file.
    :type: FileStream or String file path
    :param ostmInputPredictFile: Handed to the supervised selection as its output (xOutputSupFile) file.
    :type: FileStream or String file path
    :param ostmPredictFile: Handed to the supervised selection as its prediction (xPredictSupFile) file.
    :type: FileStream or String file path
    :param ostmCheckedFile: File to store the AbundanceTable data after it is being checked.
    :type: FileStream or String file path
    :param ostmOutput: File to store sample selection by methods of interest. Not referenced by this method itself.
    :type: FileStream or String file path
    :param cDelimiter: Delimiter of abundance table.
    :type: Character
    :param cFeatureNameDelimiter: Delimiter of the name of features (for instance if they contain consensus lineages indicating clades).
    :type: Character
    :param strFeatureSelection: Which method to use to select features in a targeted manner (Using average ranked abundance or average abundance).
    :type: String
    :param istmFeatures: Stream holding the features of interest, one per line, if using targeted feature methodology.
                         It is read with readlines(), so an open stream is required here.
    :type: FileStream
    :param iCount: Number of samples to select in each methods, supervised methods select this amount per label if possible.
    :type: Integer
    :param lstrMethods: List of strings indicating selection techniques.
    :type: List of string method names
    :param strLastRowMetadata: Passed to AbundanceTable.funcMakeFromFile as sLastMetadataRow.
    :type: String
    :param strLabel: The metadata used for supervised labels.
    :type: String
    :param strStratify: The metadata used to stratify unsupervised data.
    :type: String
    :param strCustomAlpha: Custom alpha diversity metric
    :type: String
    :param strCustomBeta: Custom beta diversity metric
    :type: String
    :param strAlphaMetadata: Metadata id which is a diversity metric to use in highest diversity sampling
    :type: String
    :param istmBetaMatrix: File containing precalculated beta-diversity matrix for representative sampling
    :type: FileStream or String file path
    :param istrmTree: File containing tree for phylogenetic beta-diversity analysis
    :type: FileStream or String file path
    :param istrmEnvr: File containing environment for phylogenetic beta-diversity analysis
    :type: FileStream or String file path
    :param iMinSeqs: Minimum sequence in the occurrence filter which filters all features not with a minimum number of sequences in each of a minimum number of samples.
    :type: Integer
    :param iMinSamples: Minimum sample count for the occurrence filter.
    :type: Integer
    :param fInvertDiversity: When true will invert diversity measurements before using.
    :type: Boolean
    :return Selected Samples: Samples selected by methods, or False when the run could not be performed
            (missing feature file, unreadable abundance table, or a label with fewer than 2 distinct values).
            Dictionary {"Selection Method":["SampleID","SampleID","SampleID",...]}
    """

    # Holds the top ranked samples from different metrics
    # dict[metric name] = [samplename,samplename...]
    selectedSamples = dict()

    # Perform different flows flags
    c_RUN_MAX_DIVERSITY_1 = ConstantsMicropita.c_strDiversity in lstrMethods
    c_RUN_REPRESENTIVE_DISSIMILARITY_2 = ConstantsMicropita.c_strRepresentative in lstrMethods
    c_RUN_MAX_DISSIMILARITY_3 = ConstantsMicropita.c_strExtreme in lstrMethods
    c_RUN_RANK_AVERAGE_USER_4 = ConstantsMicropita.c_strFeature in lstrMethods
    c_RUN_RANDOM_5 = ConstantsMicropita.c_strRandom in lstrMethods
    c_RUN_DISTINCT = ConstantsMicropita.c_strDistinct in lstrMethods
    c_RUN_DISCRIMINANT = ConstantsMicropita.c_strDiscriminant in lstrMethods
    fRunNormalizeSensitive = c_RUN_MAX_DIVERSITY_1 or c_RUN_REPRESENTIVE_DISSIMILARITY_2 or c_RUN_MAX_DISSIMILARITY_3

    # Targeted feature selection cannot run without the feature file
    if c_RUN_RANK_AVERAGE_USER_4 and not istmFeatures:
        logging.error("MicroPITA.funcRun:: Did not receive both the Targeted feature file and the feature selection method. MicroPITA did not run.")
        return False

    # Diversity metrics to run
    # Use custom metrics if specified
    # Custom beta metrics set to normalized only, custom alpha metrics set to count only
    diversityMetricsAlpha = [] if strCustomAlpha or strAlphaMetadata else [MicroPITA.c_strInverseSimpsonDiversity]
    if istmBetaMatrix:
        diversityMetricsBeta = []
    elif strCustomBeta:
        diversityMetricsBeta = [strCustomBeta]
    else:
        diversityMetricsBeta = [MicroPITA.c_strBrayCurtisDissimilarity]
    if strAlphaMetadata:
        diversityMetricsAlphaNoNormalize = [strAlphaMetadata]
    elif strCustomAlpha:
        diversityMetricsAlphaNoNormalize = [strCustomAlpha]
    else:
        diversityMetricsAlphaNoNormalize = []
    diversityMetricsBetaNoNormalize = []

    # Targeted taxa: read in taxa list, break down to lines and filter out empty strings.
    # Built as a real list because it is reused for every stratified table below.
    userDefinedTaxa = []
    if c_RUN_RANK_AVERAGE_USER_4:
        userDefinedTaxa = [sTaxon for sTaxon in (sLine.strip() for sLine in istmFeatures.readlines()) if sTaxon]

    # Read in abundance data
    # Abundance is a structured array. Samples (column) by Taxa (rows) with the taxa id row included as the column index=0
    # Abundance table object to read in and manage data
    totalAbundanceTable = AbundanceTable.funcMakeFromFile(xInputFile=istmInput, lOccurenceFilter=[iMinSeqs, iMinSamples],
                                                          cDelimiter=cDelimiter, sMetadataID=strIDName, sLastMetadataRow=strLastRowMetadata,
                                                          sLastMetadata=strLastMetadataName, cFeatureNameDelimiter=cFeatureNameDelimiter,
                                                          xOutputFile=ostmCheckedFile)
    if not totalAbundanceTable:
        logging.error("MicroPITA.funcRun:: Could not read in the abundance table. Analysis was not performed." +
                      " This often occurs when the Last Metadata is not specified correctly." +
                      " Please check to make sure the Last Metadata selection is the row of the last metadata," +
                      " all values after this selection should be microbial measurements and should be numeric.")
        return False

    lsOriginalLabels = SVM.funcMakeLabels(totalAbundanceTable.funcGetMetadata(strLabel)) if strLabel else strLabel

    dictTotalMetadata = totalAbundanceTable.funcGetMetadataCopy()
    logging.debug("MicroPITA.funcRun:: Received metadata=" + str(dictTotalMetadata))
    # If there is only 1 unique value for the labels, do not run the Supervised methods
    if strLabel and (len(set(dictTotalMetadata.get(strLabel, []))) < 2):
        logging.error("The label " + strLabel + " did not have 2 or more values. Labels found=" +
                      str(dictTotalMetadata.get(strLabel, [])))
        return False

    # Run unsupervised methods
    # Stratify the data if need be and drop the old data
    lStratifiedAbundanceTables = totalAbundanceTable.funcStratifyByMetadata(strStratify) if strStratify else [totalAbundanceTable]

    # For each stratified abundance block or for the unstratified abundance
    # Run the unsupervised blocks
    fAppendSupFiles = False
    for stratAbundanceTable in lStratifiedAbundanceTables:
        logging.info("MicroPITA.funcRun:: Running abundance block:" + stratAbundanceTable.funcGetName())

        # NOT SUMMED, NOT NORMALIZED
        # Only perform if the data is not yet normalized; need to first work with unnormalized data
        if fRunNormalizeSensitive and not stratAbundanceTable.funcIsNormalized():
            # No separate inverse metric list exists, the beta list is passed for both
            self._funcRunNormalizeSensitiveMethods(abndData=stratAbundanceTable, iSampleSelectionCount=iCount,
                                                   dictSelectedSamples=selectedSamples, lsAlphaMetrics=diversityMetricsAlphaNoNormalize,
                                                   lsBetaMetrics=diversityMetricsBetaNoNormalize,
                                                   lsInverseBetaMetrics=diversityMetricsBetaNoNormalize,
                                                   fRunDiversity=c_RUN_MAX_DIVERSITY_1, fRunRepresentative=c_RUN_REPRESENTIVE_DISSIMILARITY_2,
                                                   fRunExtreme=c_RUN_MAX_DISSIMILARITY_3, strAlphaMetadata=strAlphaMetadata,
                                                   istrmTree=istrmTree, istrmEnvr=istrmEnvr, fInvertDiversity=fInvertDiversity)

        # SUMMED AND NORMALIZED
        stratAbundanceTable.funcSumClades()
        # Normalize data at this point
        stratAbundanceTable.funcNormalize()

        # Generate selection by the rank average of user defined taxa
        # Expects (Taxa (row) by Samples (column))
        # Expects a column 0 of taxa id that is skipped
        # Returns [(sample name,average,rank)]
        if c_RUN_RANK_AVERAGE_USER_4:
            # NOTE(review): assignment replaces any selection stored for an earlier stratum,
            # unlike the other methods which extend their lists - confirm this is intended.
            selectedSamples[ConstantsMicropita.c_strFeature] = self.funcSelectTargetedTaxaSamples(
                abndMatrix=stratAbundanceTable, lsTargetedTaxa=userDefinedTaxa,
                iSampleSelectionCount=iCount, sMethod=strFeatureSelection)
            logging.info("MicroPITA.funcRun:: Selected Samples Rank")
            logging.info(selectedSamples)

        # SUMMED AND NORMALIZED analysis block
        # Diversity based metric will move reduce to terminal taxa as needed
        if fRunNormalizeSensitive:
            self._funcRunNormalizeSensitiveMethods(abndData=stratAbundanceTable, iSampleSelectionCount=iCount,
                                                   dictSelectedSamples=selectedSamples, lsAlphaMetrics=diversityMetricsAlpha,
                                                   lsBetaMetrics=diversityMetricsBeta,
                                                   lsInverseBetaMetrics=diversityMetricsBeta,
                                                   fRunDiversity=c_RUN_MAX_DIVERSITY_1, fRunRepresentative=c_RUN_REPRESENTIVE_DISSIMILARITY_2,
                                                   fRunExtreme=c_RUN_MAX_DISSIMILARITY_3,
                                                   istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr,
                                                   fInvertDiversity=fInvertDiversity)

        # 5::Select randomly
        # Expects sampleNames = List of sample names [name, name, name...]
        if c_RUN_RANDOM_5:
            # NOTE(review): like the targeted selection above, this overwrites earlier strata - confirm.
            selectedSamples[ConstantsMicropita.c_strRandom] = self.funcGetRandomSamples(
                lsSamples=stratAbundanceTable.funcGetSampleNames(), iNumberOfSamplesToReturn=iCount)
            logging.info("MicroPITA.funcRun:: Selected Samples Random")
            logging.info(selectedSamples)

        # Perform supervised selection
        if (c_RUN_DISTINCT or c_RUN_DISCRIMINANT) and strLabel:
            dictSelectionRet = self.funcRunSupervisedDistancesFromCentroids(abundanceTable=stratAbundanceTable,
                                                                            fRunDistinct=c_RUN_DISTINCT, fRunDiscriminant=c_RUN_DISCRIMINANT,
                                                                            xOutputSupFile=ostmInputPredictFile, xPredictSupFile=ostmPredictFile,
                                                                            strSupervisedMetadata=strLabel, iSampleSupSelectionCount=iCount,
                                                                            lsOriginalSampleNames=totalAbundanceTable.funcGetSampleNames(),
                                                                            lsOriginalLabels=lsOriginalLabels,
                                                                            fAppendFiles=fAppendSupFiles)

            for sKey, lValue in dictSelectionRet.items():
                selectedSamples.setdefault(sKey, []).extend(lValue)

            # Every later stratum is flagged to append to the supervised files
            fAppendSupFiles = True
            logging.info("MicroPITA.funcRun:: Selected Samples Unsupervised")
            logging.info(selectedSamples)
    return selectedSamples
# Testing: Happy path tested
@staticmethod
def funcWriteSelectionToFile(dictSelection, xOutputFilePath):
    """
    Writes the selection of samples by method to an output file.

    Nothing is written (and no file is opened) when dictSelection is empty.
    A file opened here from a path is closed before returning; a stream handed in by the
    caller is left open.

    :param dictSelection: The dictionary of selections by method to be written to a file.
    :type: Dictionary The dictionary of selections by method {"method":["sample selected","sample selected"...]}
    :param xOutputFilePath: FileStream or String path to file in which the dictionary is written.
    :type: String FileStream or String path to file
    """

    if not dictSelection:
        return

    # Only close what was opened here
    fOpenedHere = isinstance(xOutputFilePath, str)
    ostmOutput = open(xOutputFilePath, "w") if fOpenedHere else xOutputFilePath
    try:
        csvWriter = csv.writer(ostmOutput, delimiter=ConstantsMicropita.c_outputFileDelim)

        # One row per method: the method name followed by its selected samples
        for sKey in dictSelection:
            csvWriter.writerow([sKey] + list(dictSelection[sKey]))
            logging.debug("MicroPITA.funcRun:: Selected samples output to file:" + str(dictSelection[sKey]))
    finally:
        if fOpenedHere:
            ostmOutput.close()


# Testing: Happy Path tested
@staticmethod
def funcReadSelectionFileToDictionary(xInputFile):
    """
    Reads in an output selection file from micropita and formats it into a dictionary.

    The first field of each row is the key and the remaining fields are its list of samples.
    Rows without any field (blank lines) are skipped. A file opened here from a path is closed
    before returning; a stream handed in by the caller is left open.

    :param xInputFile: String path to file or file stream to read and translate into a dictionary.
                       {"method":["sample selected","sample selected"...]}
    :type: FileStream or String Path to file
    :return Dictionary: Samples selected by methods.
            Dictionary {"Selection Method":["SampleID","SampleID","SampleID",...]}
    """

    # Only close what was opened here
    fOpenedHere = isinstance(xInputFile, str)
    istmInput = open(xInputFile, 'r') if fOpenedHere else xInputFile
    try:
        istmReader = csv.reader(istmInput, delimiter=ConstantsMicropita.c_outputFileDelim)

        # Dictionary to hold selection data; the reader yields [] for a blank line
        return dict((lsLine[0], lsLine[1:]) for lsLine in istmReader if lsLine)
    finally:
        if fOpenedHere:
            istmInput.close()


# Set up arguments reader
argp = argparse.ArgumentParser(prog="MicroPITA.py",
                               description="""Selects samples from abundance tables based on various selection schemes.""")
NORMALIZED analysis block - #Diversity based metric will move reduce to terminal taxa as needed - if c_RUN_MAX_DIVERSITY_1 or c_RUN_REPRESENTIVE_DISSIMILARITY_2 or c_RUN_MAX_DISSIMILARITY_3: +args = argp.add_argument_group("Common", "Commonly modified options") +args.add_argument(ConstantsMicropita.c_strCountArgument, "--num", dest="iCount", + metavar="samples", default=10, type=int, help=ConstantsMicropita.c_strCountHelp) +args.add_argument("-m", "--method", dest="lstrMethods", metavar="method", default=[], help=ConstantsMicropita.c_strSelectionTechniquesHelp, + choices=ConstantsMicropita.c_lsAllMethods, action="append") - self._funcRunNormalizeSensitiveMethods(abndData=stratAbundanceTable, iSampleSelectionCount=iCount, - dictSelectedSamples=selectedSamples, lsAlphaMetrics=diversityMetricsAlpha, - lsBetaMetrics=diversityMetricsBeta, - lsInverseBetaMetrics=diversityMetricsBeta, - fRunDiversity=c_RUN_MAX_DIVERSITY_1,fRunRepresentative=c_RUN_REPRESENTIVE_DISSIMILARITY_2, - fRunExtreme=c_RUN_MAX_DISSIMILARITY_3, - istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr, fInvertDiversity=fInvertDiversity) - - #5::Select randomly - #Expects sampleNames = List of sample names [name, name, name...] 
- if(c_RUN_RANDOM_5): - #Select randomly from sample names - selectedSamples[ConstantsMicropita.c_strRandom] = self.funcGetRandomSamples(lsSamples=stratAbundanceTable.funcGetSampleNames(), iNumberOfSamplesToReturn=iCount) - logging.info("MicroPITA.funcRun:: Selected Samples Random") - logging.info(selectedSamples) - - #Perform supervised selection - if c_RUN_DISTINCT or c_RUN_DISCRIMINANT: - if strLabel: - dictSelectionRet = self.funcRunSupervisedDistancesFromCentroids(abundanceTable=stratAbundanceTable, - fRunDistinct=c_RUN_DISTINCT, fRunDiscriminant=c_RUN_DISCRIMINANT, - xOutputSupFile=ostmInputPredictFile,xPredictSupFile=ostmPredictFile, - strSupervisedMetadata=strLabel, iSampleSupSelectionCount=iCount, - lsOriginalSampleNames = totalAbundanceTable.funcGetSampleNames(), - lsOriginalLabels = lsOriginalLabels, - fAppendFiles=fAppendSupFiles) - - [selectedSamples.setdefault(sKey,[]).extend(lValue) for sKey,lValue in dictSelectionRet.items()] +args = argp.add_argument_group( + "Custom", "Selecting and inputing custom metrics") +args.add_argument("-a", "--alpha", dest="strAlphaDiversity", metavar="AlphaDiversity", default=None, + help=ConstantsMicropita.c_strCustomAlphaDiversityHelp, choices=Metric.setAlphaDiversities) +args.add_argument("-b", "--beta", dest="strBetaDiversity", metavar="BetaDiversity", default=None, help=ConstantsMicropita.c_strCustomBetaDiversityHelp, + choices=list(Metric.setBetaDiversities)+[Metric.c_strUnifracUnweighted, Metric.c_strUnifracWeighted]) +args.add_argument("-q", "--alphameta", dest="strAlphaMetadata", metavar="AlphaDiversityMetadata", + default=None, help=ConstantsMicropita.c_strCustomAlphaDiversityMetadataHelp) +args.add_argument("-x", "--betamatrix", dest="istmBetaMatrix", metavar="BetaDiversityMatrix", + default=None, help=ConstantsMicropita.c_strCustomBetaDiversityMatrixHelp) +args.add_argument("-o", "--tree", dest="istrmTree", metavar="PhylogeneticTree", + default=None, help=ConstantsMicropita.c_strCustomPhylogeneticTreeHelp) 
+args.add_argument("-i", "--envr", dest="istrmEnvr", metavar="EnvironmentFile", + default=None, help=ConstantsMicropita.c_strCustomEnvironmentFileHelp) +args.add_argument("-f", "--invertDiversity", dest="fInvertDiversity", action="store_true", + default=False, help=ConstantsMicropita.c_strInvertDiversityHelp) - if not fAppendSupFiles: - fAppendSupFiles = True - logging.info("MicroPITA.funcRun:: Selected Samples Unsupervised") - logging.info(selectedSamples) - return selectedSamples - - #Testing: Happy path tested - @staticmethod - def funcWriteSelectionToFile(dictSelection,xOutputFilePath): - """ - Writes the selection of samples by method to an output file. - - :param dictSelection: The dictionary of selections by method to be written to a file. - :type: Dictionary The dictionary of selections by method {"method":["sample selected","sample selected"...]} - :param xOutputFilePath: FileStream or String path to file inwhich the dictionary is written. - :type: String FileStream or String path to file - """ - - if not dictSelection: - return - - #Open file - f = csv.writer(open(xOutputFilePath,"w") if isinstance(xOutputFilePath, str) else xOutputFilePath, delimiter=ConstantsMicropita.c_outputFileDelim ) +args = argp.add_argument_group( + "Miscellaneous", "Row/column identifiers and feature targeting options") +args.add_argument("-d", ConstantsMicropita.c_strIDNameArgument, dest="strIDName", + metavar="sample_id", help=ConstantsMicropita.c_strIDNameHelp) +args.add_argument("-l", ConstantsMicropita.c_strLastMetadataNameArgument, dest="strLastMetadataName", metavar="metadata_id", default=None, + help=ConstantsMicropita.c_strLastMetadataNameHelp) +args.add_argument("-r", ConstantsMicropita.c_strTargetedFeatureMethodArgument, dest="strFeatureSelection", metavar="targeting_method", default=ConstantsMicropita.lsTargetedFeatureMethodValues[0], + choices=ConstantsMicropita.lsTargetedFeatureMethodValues, help=ConstantsMicropita.c_strTargetedFeatureMethodHelp) 
+args.add_argument("-t", ConstantsMicropita.c_strTargetedSelectionFileArgument, dest="istmFeatures", + metavar="feature_file", type=argparse.FileType("rU"), help=ConstantsMicropita.c_strTargetedSelectionFileHelp) +args.add_argument("-w", ConstantsMicropita.c_strFeatureMetadataArgument, dest="strLastFeatureMetadata", + metavar="Last_Feature_Metadata", default=None, help=ConstantsMicropita.c_strFeatureMetadataHelp) - #Create output content from dictionary - for sKey in dictSelection: - f.writerow([sKey]+dictSelection[sKey]) - logging.debug("MicroPITA.funcRun:: Selected samples output to file:"+str(dictSelection[sKey])) - - #Testing: Happy Path tested - @staticmethod - def funcReadSelectionFileToDictionary(xInputFile): - """ - Reads in an output selection file from micropita and formats it into a dictionary. - - :param xInputFile: String path to file or file stream to read and translate into a dictionary. - {"method":["sample selected","sample selected"...]} - :type: FileStream or String Path to file - :return Dictionary: Samples selected by methods. 
- Dictionary {"Selection Method":["SampleID","SampleID","SampleID",...]} - """ +args = argp.add_argument_group( + "Data labeling", "Metadata IDs for strata and supervised label values") +args.add_argument("-e", ConstantsMicropita.c_strSupervisedLabelArgument, dest="strLabel", + metavar="supervised_id", help=ConstantsMicropita.c_strSupervisedLabelHelp) +args.add_argument("-s", ConstantsMicropita.c_strUnsupervisedStratifyMetadataArgument, dest="strUnsupervisedStratify", metavar="stratify_id", + help=ConstantsMicropita.c_strUnsupervisedStratifyMetadataHelp) - #Open file - istmReader = csv.reader(open(xInputFile,'r') if isinstance(xInputFile, str) else xInputFile, delimiter = ConstantsMicropita.c_outputFileDelim) - - #Dictionary to hold selection data - return dict([(lsLine[0], lsLine[1:]) for lsLine in istmReader]) - -#Set up arguments reader -argp = argparse.ArgumentParser( prog = "MicroPITA.py", - description = """Selects samples from abundance tables based on various selection schemes.""" ) +args = argp.add_argument_group( + "File formatting", "Rarely modified file formatting options") +args.add_argument("-j", ConstantsMicropita.c_strFileDelimiterArgument, dest="cFileDelimiter", + metavar="column_delimiter", default="\t", help=ConstantsMicropita.c_strFileDelimiterHelp) +args.add_argument("-k", ConstantsMicropita.c_strFeatureNameDelimiterArgument, dest="cFeatureNameDelimiter", + metavar="taxonomy_delimiter", default="|", help=ConstantsMicropita.c_strFeatureNameDelimiterHelp) -args = argp.add_argument_group( "Common", "Commonly modified options" ) -args.add_argument(ConstantsMicropita.c_strCountArgument,"--num", dest="iCount", metavar = "samples", default = 10, type = int, help = ConstantsMicropita.c_strCountHelp) -args.add_argument("-m","--method", dest = "lstrMethods", metavar = "method", default = [], help = ConstantsMicropita.c_strSelectionTechniquesHelp, - choices = ConstantsMicropita.c_lsAllMethods, action = "append") +args = argp.add_argument_group( + 
"Debugging", "Debugging options - modify at your own risk!") +args.add_argument("-v", ConstantsMicropita.c_strLoggingArgument, dest="strLogLevel", metavar="log_level", default="WARNING", + choices=ConstantsMicropita.c_lsLoggingChoices, help=ConstantsMicropita.c_strLoggingHelp) +args.add_argument("-c", ConstantsMicropita.c_strCheckedAbundanceFileArgument, dest="ostmCheckedFile", + metavar="output_qc", type=argparse.FileType("w"), help=ConstantsMicropita.c_strCheckedAbundanceFileHelp) +args.add_argument("-g", ConstantsMicropita.c_strLoggingFileArgument, dest="ostmLoggingFile", + metavar="output_log", type=argparse.FileType("w"), help=ConstantsMicropita.c_strLoggingFileHelp) +args.add_argument("-u", ConstantsMicropita.c_strSupervisedInputFile, dest="ostmInputPredictFile", + metavar="output_scaled", type=argparse.FileType("w"), help=ConstantsMicropita.c_strSupervisedInputFileHelp) +args.add_argument("-p", ConstantsMicropita.c_strSupervisedPredictedFile, dest="ostmPredictFile", + metavar="output_labels", type=argparse.FileType("w"), help=ConstantsMicropita.c_strSupervisedPredictedFileHelp) -args = argp.add_argument_group( "Custom", "Selecting and inputing custom metrics" ) -args.add_argument("-a","--alpha", dest = "strAlphaDiversity", metavar = "AlphaDiversity", default = None, help = ConstantsMicropita.c_strCustomAlphaDiversityHelp, choices = Metric.setAlphaDiversities) -args.add_argument("-b","--beta", dest = "strBetaDiversity", metavar = "BetaDiversity", default = None, help = ConstantsMicropita.c_strCustomBetaDiversityHelp, choices = list(Metric.setBetaDiversities)+[Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted]) -args.add_argument("-q","--alphameta", dest = "strAlphaMetadata", metavar = "AlphaDiversityMetadata", default = None, help = ConstantsMicropita.c_strCustomAlphaDiversityMetadataHelp) -args.add_argument("-x","--betamatrix", dest = "istmBetaMatrix", metavar = "BetaDiversityMatrix", default = None, help = 
ConstantsMicropita.c_strCustomBetaDiversityMatrixHelp) -args.add_argument("-o","--tree", dest = "istrmTree", metavar = "PhylogeneticTree", default = None, help = ConstantsMicropita.c_strCustomPhylogeneticTreeHelp) -args.add_argument("-i","--envr", dest = "istrmEnvr", metavar = "EnvironmentFile", default = None, help = ConstantsMicropita.c_strCustomEnvironmentFileHelp) -args.add_argument("-f","--invertDiversity", dest = "fInvertDiversity", action="store_true", default = False, help = ConstantsMicropita.c_strInvertDiversityHelp) - -args = argp.add_argument_group( "Miscellaneous", "Row/column identifiers and feature targeting options" ) -args.add_argument("-d",ConstantsMicropita.c_strIDNameArgument, dest="strIDName", metavar="sample_id", help= ConstantsMicropita.c_strIDNameHelp) -args.add_argument("-l",ConstantsMicropita.c_strLastMetadataNameArgument, dest="strLastMetadataName", metavar = "metadata_id", default = None, - help= ConstantsMicropita.c_strLastMetadataNameHelp) -args.add_argument("-r",ConstantsMicropita.c_strTargetedFeatureMethodArgument, dest="strFeatureSelection", metavar="targeting_method", default=ConstantsMicropita.lsTargetedFeatureMethodValues[0], - choices=ConstantsMicropita.lsTargetedFeatureMethodValues, help= ConstantsMicropita.c_strTargetedFeatureMethodHelp) -args.add_argument("-t",ConstantsMicropita.c_strTargetedSelectionFileArgument, dest="istmFeatures", metavar="feature_file", type=argparse.FileType("rU"), help=ConstantsMicropita.c_strTargetedSelectionFileHelp) -args.add_argument("-w",ConstantsMicropita.c_strFeatureMetadataArgument, dest="strLastFeatureMetadata", metavar="Last_Feature_Metadata", default=None, help=ConstantsMicropita.c_strFeatureMetadataHelp) +argp.add_argument("istmInput", metavar="input.pcl/biome", type=argparse.FileType("rU"), help=ConstantsMicropita.c_strAbundanceFileHelp, + default=sys.stdin) +argp.add_argument("ostmOutput", metavar="output.txt", type=argparse.FileType("w"), 
help=ConstantsMicropita.c_strGenericOutputDataFileHelp, + default=sys.stdout) -args = argp.add_argument_group( "Data labeling", "Metadata IDs for strata and supervised label values" ) -args.add_argument("-e",ConstantsMicropita.c_strSupervisedLabelArgument, dest="strLabel", metavar= "supervised_id", help=ConstantsMicropita.c_strSupervisedLabelHelp) -args.add_argument("-s",ConstantsMicropita.c_strUnsupervisedStratifyMetadataArgument, dest="strUnsupervisedStratify", metavar="stratify_id", - help= ConstantsMicropita.c_strUnsupervisedStratifyMetadataHelp) +__doc__ = "::\n\n\t" + argp.format_help().replace("\n", "\n\t") + __doc__ + -args = argp.add_argument_group( "File formatting", "Rarely modified file formatting options" ) -args.add_argument("-j",ConstantsMicropita.c_strFileDelimiterArgument, dest="cFileDelimiter", metavar="column_delimiter", default="\t", help=ConstantsMicropita.c_strFileDelimiterHelp) -args.add_argument("-k",ConstantsMicropita.c_strFeatureNameDelimiterArgument, dest="cFeatureNameDelimiter", metavar="taxonomy_delimiter", default="|", help=ConstantsMicropita.c_strFeatureNameDelimiterHelp) +def _main(): + args = argp.parse_args() -args = argp.add_argument_group( "Debugging", "Debugging options - modify at your own risk!" 
) -args.add_argument("-v",ConstantsMicropita.c_strLoggingArgument, dest="strLogLevel", metavar = "log_level", default="WARNING", - choices=ConstantsMicropita.c_lsLoggingChoices, help= ConstantsMicropita.c_strLoggingHelp) -args.add_argument("-c",ConstantsMicropita.c_strCheckedAbundanceFileArgument, dest="ostmCheckedFile", metavar = "output_qc", type = argparse.FileType("w"), help = ConstantsMicropita.c_strCheckedAbundanceFileHelp) -args.add_argument("-g",ConstantsMicropita.c_strLoggingFileArgument, dest="ostmLoggingFile", metavar = "output_log", type = argparse.FileType("w"), help = ConstantsMicropita.c_strLoggingFileHelp) -args.add_argument("-u",ConstantsMicropita.c_strSupervisedInputFile, dest="ostmInputPredictFile", metavar = "output_scaled", type = argparse.FileType("w"), help = ConstantsMicropita.c_strSupervisedInputFileHelp) -args.add_argument("-p",ConstantsMicropita.c_strSupervisedPredictedFile, dest="ostmPredictFile", metavar = "output_labels", type = argparse.FileType("w"), help = ConstantsMicropita.c_strSupervisedPredictedFileHelp) + # Set up logger + iLogLevel = getattr(logging, args.strLogLevel.upper(), None) + logging.basicConfig( + stream=args.ostmLoggingFile if args.ostmLoggingFile else sys.stderr, filemode='w', level=iLogLevel) -argp.add_argument("istmInput", metavar = "input.pcl/biome", type = argparse.FileType("rU"), help = ConstantsMicropita.c_strAbundanceFileHelp, - default = sys.stdin) -argp.add_argument("ostmOutput", metavar = "output.txt", type = argparse.FileType("w"), help = ConstantsMicropita.c_strGenericOutputDataFileHelp, - default = sys.stdout) + # Run micropita + logging.info("MicroPITA:: Start microPITA") + microPITA = MicroPITA() -__doc__ = "::\n\n\t" + argp.format_help( ).replace( "\n", "\n\t" ) + __doc__ + # Argparse will append to the default but will not remove the default so I do this here + if not len(args.lstrMethods): + args.lstrMethods = [ConstantsMicropita.c_strRepresentative] -def _main( ): - args = argp.parse_args( ) - - 
#Set up logger - iLogLevel = getattr(logging, args.strLogLevel.upper(), None) - logging.basicConfig(stream = args.ostmLoggingFile if args.ostmLoggingFile else sys.stderr, filemode = 'w', level=iLogLevel) - - #Run micropita - logging.info("MicroPITA:: Start microPITA") - microPITA = MicroPITA() - - #Argparse will append to the default but will not remove the default so I do this here - if not len(args.lstrMethods): - args.lstrMethods = [ConstantsMicropita.c_strRepresentative] + dictSelectedSamples = microPITA.funcRun( + strIDName=args.strIDName, + strLastMetadataName=args.strLastMetadataName, + istmInput=args.istmInput, + ostmInputPredictFile=args.ostmInputPredictFile, + ostmPredictFile=args.ostmPredictFile, + ostmCheckedFile=args.ostmCheckedFile, + ostmOutput=args.ostmOutput, + cDelimiter=args.cFileDelimiter, + cFeatureNameDelimiter=args.cFeatureNameDelimiter, + istmFeatures=args.istmFeatures, + strFeatureSelection=args.strFeatureSelection, + iCount=args.iCount, + strLastRowMetadata=args.strLastFeatureMetadata, + strLabel=args.strLabel, + strStratify=args.strUnsupervisedStratify, + strCustomAlpha=args.strAlphaDiversity, + strCustomBeta=args.strBetaDiversity, + strAlphaMetadata=args.strAlphaMetadata, + istmBetaMatrix=args.istmBetaMatrix, + istrmTree=args.istrmTree, + istrmEnvr=args.istrmEnvr, + lstrMethods=args.lstrMethods, + fInvertDiversity=args.fInvertDiversity + ) - dictSelectedSamples = microPITA.funcRun( - strIDName = args.strIDName, - strLastMetadataName = args.strLastMetadataName, - istmInput = args.istmInput, - ostmInputPredictFile = args.ostmInputPredictFile, - ostmPredictFile = args.ostmPredictFile, - ostmCheckedFile = args.ostmCheckedFile, - ostmOutput = args.ostmOutput, - cDelimiter = args.cFileDelimiter, - cFeatureNameDelimiter = args.cFeatureNameDelimiter, - istmFeatures = args.istmFeatures, - strFeatureSelection = args.strFeatureSelection, - iCount = args.iCount, - strLastRowMetadata = args.strLastFeatureMetadata, - strLabel = args.strLabel, - 
strStratify = args.strUnsupervisedStratify, - strCustomAlpha = args.strAlphaDiversity, - strCustomBeta = args.strBetaDiversity, - strAlphaMetadata = args.strAlphaMetadata, - istmBetaMatrix = args.istmBetaMatrix, - istrmTree = args.istrmTree, - istrmEnvr = args.istrmEnvr, - lstrMethods = args.lstrMethods, - fInvertDiversity = args.fInvertDiversity - ) + if not dictSelectedSamples: + logging.error("MicroPITA:: Error, did not get a result from analysis.") + return -1 + logging.info("End microPITA") - if not dictSelectedSamples: - logging.error("MicroPITA:: Error, did not get a result from analysis.") - return -1 - logging.info("End microPITA") + # Log output for debugging + logging.debug("MicroPITA:: Returned the following samples:" + + str(dictSelectedSamples)) - #Log output for debugging - logging.debug("MicroPITA:: Returned the following samples:"+str(dictSelectedSamples)) + # Write selection to file + microPITA.funcWriteSelectionToFile( + dictSelection=dictSelectedSamples, xOutputFilePath=args.ostmOutput) - #Write selection to file - microPITA.funcWriteSelectionToFile(dictSelection=dictSelectedSamples, xOutputFilePath=args.ostmOutput) if __name__ == "__main__": - _main( ) + _main()