comparison MicroPITA.py @ 28:1d09ffab87a7 draft

Uploaded MicroPITA.py - fixed spaces and tabs inconsistencies
author george-weingart
date Tue, 22 Jun 2021 03:23:17 +0000
parents 7d25ecd225dd
children
comparison
equal deleted inserted replaced
27:d9862a9a4d84 28:1d09ffab87a7
3 Author: Timothy Tickle 3 Author: Timothy Tickle
4 Description: Class to Run analysis for the microPITA paper 4 Description: Class to Run analysis for the microPITA paper
5 """ 5 """
6 6
7 ##################################################################################### 7 #####################################################################################
8 #Copyright (C) <2012> 8 # Copyright (C) <2012>
9 # 9 #
10 #Permission is hereby granted, free of charge, to any person obtaining a copy of 10 # Permission is hereby granted, free of charge, to any person obtaining a copy of
11 #this software and associated documentation files (the "Software"), to deal in the 11 # this software and associated documentation files (the "Software"), to deal in the
12 #Software without restriction, including without limitation the rights to use, copy, 12 # Software without restriction, including without limitation the rights to use, copy,
13 #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, 13 # modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
14 #and to permit persons to whom the Software is furnished to do so, subject to 14 # and to permit persons to whom the Software is furnished to do so, subject to
15 #the following conditions: 15 # the following conditions:
16 # 16 #
17 #The above copyright notice and this permission notice shall be included in all copies 17 # The above copyright notice and this permission notice shall be included in all copies
18 #or substantial portions of the Software. 18 # or substantial portions of the Software.
19 # 19 #
20 #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
21 #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 21 # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
22 #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 # PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
23 #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
24 #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24 # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25 # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 ##################################################################################### 26 #####################################################################################
27 27
28 from types import *
29 import scipy.spatial.distance
30 import scipy.cluster.hierarchy as hcluster
31 import random
32 import os
33 import operator
34 import numpy as np
35 import mlpy
36 import math
37 import logging
38 import csv
39 from src.ConstantsMicropita import ConstantsMicropita
40 from src.breadcrumbs.src.UtilityMath import UtilityMath
41 from src.breadcrumbs.src.SVM import SVM
42 from src.breadcrumbs.src.MLPYDistanceAdaptor import MLPYDistanceAdaptor
43 from src.breadcrumbs.src.KMedoids import Kmedoids
44 from src.breadcrumbs.src.Metric import Metric
45 from src.breadcrumbs.src.ConstantsBreadCrumbs import ConstantsBreadCrumbs
46 from src.breadcrumbs.src.AbundanceTable import AbundanceTable
28 __author__ = "Timothy Tickle" 47 __author__ = "Timothy Tickle"
29 __copyright__ = "Copyright 2012" 48 __copyright__ = "Copyright 2012"
30 __credits__ = ["Timothy Tickle"] 49 __credits__ = ["Timothy Tickle"]
31 __license__ = "MIT" 50 __license__ = "MIT"
32 __maintainer__ = "Timothy Tickle" 51 __maintainer__ = "Timothy Tickle"
33 __email__ = "ttickle@sph.harvard.edu" 52 __email__ = "ttickle@sph.harvard.edu"
34 __status__ = "Development" 53 __status__ = "Development"
35 54
36 import sys 55 import sys
37 import argparse 56 import argparse
38 from src.breadcrumbs.src.AbundanceTable import AbundanceTable
39 import warnings 57 import warnings
40 warnings.simplefilter(action = "ignore", category = FutureWarning) 58 warnings.simplefilter(action="ignore", category=FutureWarning)
41 from src.breadcrumbs.src.ConstantsBreadCrumbs import ConstantsBreadCrumbs 59
42 from src.breadcrumbs.src.Metric import Metric
43 from src.breadcrumbs.src.KMedoids import Kmedoids
44 from src.breadcrumbs.src.MLPYDistanceAdaptor import MLPYDistanceAdaptor
45 from src.breadcrumbs.src.SVM import SVM
46 from src.breadcrumbs.src.UtilityMath import UtilityMath
47
48 from src.ConstantsMicropita import ConstantsMicropita
49 import csv
50 import logging
51 import math
52 import mlpy
53 import numpy as np
54 import operator
55 import os
56 import random
57 import scipy.cluster.hierarchy as hcluster
58 import scipy.spatial.distance
59 from types import *
60 60
61 class MicroPITA: 61 class MicroPITA:
62 """ 62 """
63 Selects samples from a first tier of a multi-tiered study to be used in a second tier. 63 Selects samples from a first tier of a multi-tiered study to be used in a second tier.
64 Different methods can be used for selection. 64 Different methods can be used for selection.
65 The expected input is an abundance table (and potentially a text file of targeted features, 65 The expected input is an abundance table (and potentially a text file of targeted features,
66 if using the targeted features option). Output is a list of samples exhibiting the 66 if using the targeted features option). Output is a list of samples exhibiting the
67 characteristics of interest. 67 characteristics of interest.
68 """ 68 """
69 69
70 #Constants 70 # Constants
71 #Diversity metrics Alpha 71 # Diversity metrics Alpha
72 c_strInverseSimpsonDiversity = Metric.c_strInvSimpsonDiversity 72 c_strInverseSimpsonDiversity = Metric.c_strInvSimpsonDiversity
73 c_strChao1Diversity = Metric.c_strChao1Diversity 73 c_strChao1Diversity = Metric.c_strChao1Diversity
74 74
75 #Diversity metrics Beta 75 # Diversity metrics Beta
76 c_strBrayCurtisDissimilarity = Metric.c_strBrayCurtisDissimilarity 76 c_strBrayCurtisDissimilarity = Metric.c_strBrayCurtisDissimilarity
77 77
78 #Additive inverses of diversity metrics beta 78 # Additive inverses of diversity metrics beta
79 c_strInvBrayCurtisDissimilarity = Metric.c_strInvBrayCurtisDissimilarity 79 c_strInvBrayCurtisDissimilarity = Metric.c_strInvBrayCurtisDissimilarity
80 80
81 #Technique Names 81 # Technique Names
82 ConstantsMicropita.c_strDiversity2 = ConstantsMicropita.c_strDiversity+"_C" 82 ConstantsMicropita.c_strDiversity2 = ConstantsMicropita.c_strDiversity+"_C"
83 83
84 #Targeted feature settings 84 # Targeted feature settings
85 c_strTargetedRanked = ConstantsMicropita.c_strTargetedRanked 85 c_strTargetedRanked = ConstantsMicropita.c_strTargetedRanked
86 c_strTargetedAbundance = ConstantsMicropita.c_strTargetedAbundance 86 c_strTargetedAbundance = ConstantsMicropita.c_strTargetedAbundance
87 87
88 #Technique groupings 88 # Technique groupings
89 # c_lsDiversityMethods = [ConstantsMicropita.c_strDiversity,ConstantsMicropita.c_strDiversity2] 89 # c_lsDiversityMethods = [ConstantsMicropita.c_strDiversity,ConstantsMicropita.c_strDiversity2]
90 90
91 #Converts ecology metrics into standardized method selection names 91 # Converts ecology metrics into standardized method selection names
92 dictConvertAMetricDiversity = {c_strInverseSimpsonDiversity:ConstantsMicropita.c_strDiversity, c_strChao1Diversity:ConstantsMicropita.c_strDiversity2} 92 dictConvertAMetricDiversity = {c_strInverseSimpsonDiversity: ConstantsMicropita.c_strDiversity,
93 c_strChao1Diversity: ConstantsMicropita.c_strDiversity2}
93 # dictConvertMicroPITAToAMetric = {ConstantsMicropita.c_strDiversity:c_strInverseSimpsonDiversity, ConstantsMicropita.c_strDiversity2:c_strChao1Diversity} 94 # dictConvertMicroPITAToAMetric = {ConstantsMicropita.c_strDiversity:c_strInverseSimpsonDiversity, ConstantsMicropita.c_strDiversity2:c_strChao1Diversity}
94 dictConvertBMetricToMethod = {c_strBrayCurtisDissimilarity:ConstantsMicropita.c_strRepresentative} 95 dictConvertBMetricToMethod = {
95 dictConvertInvBMetricToMethod = {c_strBrayCurtisDissimilarity:ConstantsMicropita.c_strExtreme} 96 c_strBrayCurtisDissimilarity: ConstantsMicropita.c_strRepresentative}
96 97 dictConvertInvBMetricToMethod = {
97 #Linkage used in the Hierarchical clustering 98 c_strBrayCurtisDissimilarity: ConstantsMicropita.c_strExtreme}
98 c_strHierarchicalClusterMethod = 'average' 99
99 100 # Linkage used in the Hierarchical clustering
100 ####Group 1## Diversity 101 c_strHierarchicalClusterMethod = 'average'
101 #Testing: Happy path Testing (8) 102
102 def funcGetTopRankedSamples(self, lldMatrix = None, lsSampleNames = None, iTopAmount = None): 103 # Group 1## Diversity
103 """ 104 # Testing: Happy path Testing (8)
104 Given a list of lists of measurements, for each list the indices of the highest values are returned. If lsSamplesNames is given 105 def funcGetTopRankedSamples(self, lldMatrix=None, lsSampleNames=None, iTopAmount=None):
105 it is treated as a list of string names that is in the order of the measurements in each list. Indices are returned or the sample 106 """
106 names associated with the indices. 107 Given a list of lists of measurements, for each list the indices of the highest values are returned. If lsSamplesNames is given
107 108 it is treated as a list of string names that is in the order of the measurements in each list. Indices are returned or the sample
108 :param lldMatrix: List of lists [[value,value,value,value],[value,value,value,value]]. 109 names associated with the indices.
109 :type: List of lists List of measurements. Each list is a different measurement. Each measurement in positionally related to a sample. 110
110 :param lsSampleNames: List of sample names positionally related (the same) to each list (Optional). 111 :param lldMatrix: List of lists [[value,value,value,value],[value,value,value,value]].
111 :type: List of strings List of strings. 112 :type: List of lists List of measurements. Each list is a different measurement. Each measurement in positionally related to a sample.
112 :param iTopAmount: The amount of top measured samples (assumes the higher measurements are better). 113 :param lsSampleNames: List of sample names positionally related (the same) to each list (Optional).
113 :type: integer Integer amount of sample names/ indices to return. 114 :type: List of strings List of strings.
114 :return List: List of samples to be selected. 115 :param iTopAmount: The amount of top measured samples (assumes the higher measurements are better).
115 """ 116 :type: integer Integer amount of sample names/ indices to return.
116 topRankListRet = [] 117 :return List: List of samples to be selected.
117 for rowMetrics in lldMatrix: 118 """
118 #Create 2 d array to hold value and index and sort 119 topRankListRet = []
119 liIndexX = [rowMetrics,range(len(rowMetrics))] 120 for rowMetrics in lldMatrix:
120 liIndexX[1].sort(key = liIndexX[0].__getitem__,reverse = True) 121 # Create 2 d array to hold value and index and sort
121 122 liIndexX = [rowMetrics, range(len(rowMetrics))]
122 if lsSampleNames: 123 liIndexX[1].sort(key=liIndexX[0].__getitem__, reverse=True)
123 topRankListRet.append([lsSampleNames[iIndex] for iIndex in liIndexX[1][:iTopAmount]]) 124
124 else: 125 if lsSampleNames:
125 topRankListRet.append(liIndexX[1][:iTopAmount]) 126 topRankListRet.append([lsSampleNames[iIndex]
126 127 for iIndex in liIndexX[1][:iTopAmount]])
127 return topRankListRet 128 else:
128 129 topRankListRet.append(liIndexX[1][:iTopAmount])
129 ####Group 2## Representative Dissimilarity 130
130 #Testing: Happy path tested 1 131 return topRankListRet
131 def funcGetCentralSamplesByKMedoids(self, npaMatrix=None, sMetric=None, lsSampleNames=None, iNumberSamplesReturned=0, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None): 132
132 """ 133 # Group 2## Representative Dissimilarity
133 Gets centroid samples by k-medoids clustering of a given matrix. 134 # Testing: Happy path tested 1
134 135 def funcGetCentralSamplesByKMedoids(self, npaMatrix=None, sMetric=None, lsSampleNames=None, iNumberSamplesReturned=0, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None):
135 :param npaMatrix: Numpy array where row=features and columns=samples 136 """
136 :type: Numpy array Abundance Data. 137 Gets centroid samples by k-medoids clustering of a given matrix.
137 :param sMetric: String name of beta metric used as the distance metric. 138
138 :type: String String name of beta metric. 139 :param npaMatrix: Numpy array where row=features and columns=samples
139 :param lsSampleNames: The names of the sample 140 :type: Numpy array Abundance Data.
140 :type: List List of strings 141 :param sMetric: String name of beta metric used as the distance metric.
141 :param iNumberSamplesReturned: Number of samples to return, each will be a centroid of a sample. 142 :type: String String name of beta metric.
142 :type: Integer Number of samples to return 143 :param lsSampleNames: The names of the sample
143 :return List: List of selected samples. 144 :type: List List of strings
144 :param istmBetaMatrix: File with beta-diversity matrix 145 :param iNumberSamplesReturned: Number of samples to return, each will be a centroid of a sample.
145 :type: File stream or file path string 146 :type: Integer Number of samples to return
146 """ 147 :return List: List of selected samples.
147 148 :param istmBetaMatrix: File with beta-diversity matrix
148 #Count of how many rows 149 :type: File stream or file path string
149 sampleCount = npaMatrix.shape[0] 150 """
150 if iNumberSamplesReturned > sampleCount: 151
151 logging.error("MicroPITA.funcGetCentralSamplesByKMedoids:: There are not enough samples to return the amount of samples specified. Return sample count = "+str(iNumberSamplesReturned)+". Sample number = "+str(sampleCount)+".") 152 # Count of how many rows
152 return False 153 sampleCount = npaMatrix.shape[0]
153 154 if iNumberSamplesReturned > sampleCount:
154 #If the cluster count is equal to the sample count return all samples 155 logging.error("MicroPITA.funcGetCentralSamplesByKMedoids:: There are not enough samples to return the amount of samples specified. Return sample count = " +
155 if sampleCount == iNumberSamplesReturned: 156 str(iNumberSamplesReturned)+". Sample number = "+str(sampleCount)+".")
156 return list(lsSampleNames) 157 return False
157 158
158 #Get distance matrix 159 # If the cluster count is equal to the sample count return all samples
159 distanceMatrix=scipy.spatial.distance.squareform(Metric.funcReadMatrixFile(istmMatrixFile=istmBetaMatrix,lsSampleOrder=lsSampleNames)[0]) if istmBetaMatrix else Metric.funcGetBetaMetric(npadAbundancies=npaMatrix, sMetric=sMetric, istrmTree=istrmTree, istrmEnvr=istrmEnvr, lsSampleOrder=lsSampleNames) 160 if sampleCount == iNumberSamplesReturned:
160 if type(distanceMatrix) is BooleanType: 161 return list(lsSampleNames)
161 logging.error("MicroPITA.funcGetCentralSamplesByKMedoids:: Could not read in the supplied distance matrix, returning false.") 162
162 return False 163 # Get distance matrix
163 164 distanceMatrix = scipy.spatial.distance.squareform(Metric.funcReadMatrixFile(istmMatrixFile=istmBetaMatrix, lsSampleOrder=lsSampleNames)[
164 # Handle unifrac output 165 0]) if istmBetaMatrix else Metric.funcGetBetaMetric(npadAbundancies=npaMatrix, sMetric=sMetric, istrmTree=istrmTree, istrmEnvr=istrmEnvr, lsSampleOrder=lsSampleNames)
165 if sMetric in [Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted]: 166 if type(distanceMatrix) is BooleanType:
166 distanceMatrix = distanceMatrix[0] 167 logging.error(
167 168 "MicroPITA.funcGetCentralSamplesByKMedoids:: Could not read in the supplied distance matrix, returning false.")
168 #Log distance matrix 169 return False
169 logging.debug("MicroPITA.funcGetCentralSamplesByKMedoids:: Distance matrix for representative selection using metric="+str(sMetric)) 170
170 171 # Handle unifrac output
171 distance = MLPYDistanceAdaptor(npaDistanceMatrix=distanceMatrix, fIsCondensedMatrix=True) 172 if sMetric in [Metric.c_strUnifracUnweighted, Metric.c_strUnifracWeighted]:
172 173 distanceMatrix = distanceMatrix[0]
173 #Create object to determine clusters/medoids 174
174 medoidsMaker = Kmedoids(k=iNumberSamplesReturned, dist=distance) 175 # Log distance matrix
175 #medoidsData includes(1d numpy array, medoids indexes; 176 logging.debug(
176 # 1d numpy array, non-medoids indexes; 177 "MicroPITA.funcGetCentralSamplesByKMedoids:: Distance matrix for representative selection using metric="+str(sMetric))
177 # 1d numpy array, cluster membership for non-medoids; 178
178 # double, cost of configuration) 179 distance = MLPYDistanceAdaptor(
179 #npaMatrix is samples x rows 180 npaDistanceMatrix=distanceMatrix, fIsCondensedMatrix=True)
180 #Build a matrix of lists of indicies to pass to the distance matrix 181
181 lliIndicesMatrix = [[iIndexPosition] for iIndexPosition in xrange(0,len(npaMatrix))] 182 # Create object to determine clusters/medoids
182 medoidsData = medoidsMaker.compute(np.array(lliIndicesMatrix)) 183 medoidsMaker = Kmedoids(k=iNumberSamplesReturned, dist=distance)
183 logging.debug("MicroPITA.funcGetCentralSamplesByKMedoids:: Results from the kmedoid method in representative selection:") 184 # medoidsData includes(1d numpy array, medoids indexes;
184 logging.debug(str(medoidsData)) 185 # 1d numpy array, non-medoids indexes;
185 186 # 1d numpy array, cluster membership for non-medoids;
186 #If returning the same amount of clusters and samples 187 # double, cost of configuration)
187 #Return centroids 188 # npaMatrix is samples x rows
188 selectedIndexes = medoidsData[0] 189 # Build a matrix of lists of indicies to pass to the distance matrix
189 return [lsSampleNames[selectedIndexes[index]] for index in xrange(0,iNumberSamplesReturned)] 190 lliIndicesMatrix = [[iIndexPosition]
190 191 for iIndexPosition in xrange(0, len(npaMatrix))]
191 ####Group 3## Highest Dissimilarity 192 medoidsData = medoidsMaker.compute(np.array(lliIndicesMatrix))
192 #Testing: Happy path tested 193 logging.debug(
193 def funcSelectExtremeSamplesFromHClust(self, strBetaMetric, npaAbundanceMatrix, lsSampleNames, iSelectSampleCount, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None): 194 "MicroPITA.funcGetCentralSamplesByKMedoids:: Results from the kmedoid method in representative selection:")
194 """ 195 logging.debug(str(medoidsData))
195 Select extreme samples from HClustering. 196
196 197 # If returning the same amount of clusters and samples
197 :param strBetaMetric: The beta metric to use for distance matrix generation. 198 # Return centroids
198 :type: String The name of the beta metric to use. 199 selectedIndexes = medoidsData[0]
199 :param npaAbundanceMatrix: Numpy array where row=samples and columns=features. 200 return [lsSampleNames[selectedIndexes[index]] for index in xrange(0, iNumberSamplesReturned)]
200 :type: Numpy Array Abundance data. 201
201 :param lsSampleNames: The names of the sample. 202 # Group 3## Highest Dissimilarity
202 :type: List List of strings. 203 # Testing: Happy path tested
203 :param iSelectSampleCount: Number of samples to select (return). 204 def funcSelectExtremeSamplesFromHClust(self, strBetaMetric, npaAbundanceMatrix, lsSampleNames, iSelectSampleCount, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None):
204 :type: Integer Integer number of samples returned. 205 """
205 :return Samples: List of samples. 206 Select extreme samples from HClustering.
206 :param istmBetaMatrix: File with beta-diversity matrix 207
207 :type: File stream or file path string 208 :param strBetaMetric: The beta metric to use for distance matrix generation.
208 """ 209 :type: String The name of the beta metric to use.
209 210 :param npaAbundanceMatrix: Numpy array where row=samples and columns=features.
210 #If they want all the sample count, return all sample names 211 :type: Numpy Array Abundance data.
211 iSampleCount=len(npaAbundanceMatrix[:,0]) 212 :param lsSampleNames: The names of the sample.
212 if iSelectSampleCount==iSampleCount: 213 :type: List List of strings.
213 return lsSampleNames 214 :param iSelectSampleCount: Number of samples to select (return).
214 215 :type: Integer Integer number of samples returned.
215 #Holds the samples to be returned 216 :return Samples: List of samples.
216 lsReturnSamplesRet = [] 217 :param istmBetaMatrix: File with beta-diversity matrix
217 218 :type: File stream or file path string
218 #Generate beta matrix 219 """
219 #Returns condensed matrix 220
220 tempDistanceMatrix = scipy.spatial.distance.squareform(Metric.funcReadMatrixFile(istmMatrixFile=istmBetaMatrix,lsSampleOrder=lsSampleNames)[0]) if istmBetaMatrix else Metric.funcGetBetaMetric(npadAbundancies=npaAbundanceMatrix, sMetric=strBetaMetric, istrmTree=istrmTree, istrmEnvr=istrmEnvr, lsSampleOrder=lsSampleNames, fAdditiveInverse = True) 221 # If they want all the sample count, return all sample names
221 222 iSampleCount = len(npaAbundanceMatrix[:, 0])
222 if strBetaMetric in [Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted]: 223 if iSelectSampleCount == iSampleCount:
223 tempDistanceMatrix = tempDistanceMatrix[0] 224 return lsSampleNames
224 225
225 if type(tempDistanceMatrix) is BooleanType: 226 # Holds the samples to be returned
226 logging.error("MicroPITA.funcSelectExtremeSamplesFromHClust:: Could not read in the supplied distance matrix, returning false.") 227 lsReturnSamplesRet = []
227 return False 228
228 229 # Generate beta matrix
229 if istmBetaMatrix: 230 # Returns condensed matrix
230 tempDistanceMatrix = 1-tempDistanceMatrix 231 tempDistanceMatrix = scipy.spatial.distance.squareform(Metric.funcReadMatrixFile(istmMatrixFile=istmBetaMatrix, lsSampleOrder=lsSampleNames)[0]) if istmBetaMatrix else Metric.funcGetBetaMetric(
231 232 npadAbundancies=npaAbundanceMatrix, sMetric=strBetaMetric, istrmTree=istrmTree, istrmEnvr=istrmEnvr, lsSampleOrder=lsSampleNames, fAdditiveInverse=True)
232 #Feed beta matrix to linkage to cluster 233
233 #Send condensed matrix 234 if strBetaMetric in [Metric.c_strUnifracUnweighted, Metric.c_strUnifracWeighted]:
234 linkageMatrix = hcluster.linkage(tempDistanceMatrix, method=self.c_strHierarchicalClusterMethod) 235 tempDistanceMatrix = tempDistanceMatrix[0]
235 236
236 #Extract cluster information from dendrogram 237 if type(tempDistanceMatrix) is BooleanType:
237 #The linakge matrix is of the form 238 logging.error(
238 #[[int1 int2 doube int3],...] 239 "MicroPITA.funcSelectExtremeSamplesFromHClust:: Could not read in the supplied distance matrix, returning false.")
239 #int1 and int1 are the paired samples indexed at 0 and up. 240 return False
240 #each list is an entry for a branch that is number starting with the first 241
241 #list being sample count index + 1 242 if istmBetaMatrix:
242 #each list is then named by an increment as they appear 243 tempDistanceMatrix = 1-tempDistanceMatrix
243 #this means that if a number is in the list and is = sample count or greater it is not 244
244 #terminal and is instead a branch. 245 # Feed beta matrix to linkage to cluster
245 #This method just takes the lowest metric measurement (highest distance pairs/clusters) 246 # Send condensed matrix
246 #Works much better than the original technique 247 linkageMatrix = hcluster.linkage(
247 #get total number of samples 248 tempDistanceMatrix, method=self.c_strHierarchicalClusterMethod)
248 249
249 iCurrentSelectCount = 0 250 # Extract cluster information from dendrogram
250 for row in linkageMatrix: 251 # The linakge matrix is of the form
251 #Get nodes ofthe lowest pairing (so the furthest apart pair) 252 # [[int1 int2 doube int3],...]
252 iNode1 = int(row[0]) 253 # int1 and int1 are the paired samples indexed at 0 and up.
253 iNode2 = int(row[1]) 254 # each list is an entry for a branch that is number starting with the first
254 #Make sure the nodes are a terminal node (sample) and not a branch in the dendrogram 255 # list being sample count index + 1
255 #The branching in the dendrogram will start at the number of samples and increment higher. 256 # each list is then named by an increment as they appear
256 #Add each of the pair one at a time breaking when enough samples are selected. 257 # this means that if a number is in the list and is = sample count or greater it is not
257 if iNode1<iSampleCount: 258 # terminal and is instead a branch.
258 lsReturnSamplesRet.append(lsSampleNames[iNode1]) 259 # This method just takes the lowest metric measurement (highest distance pairs/clusters)
259 iCurrentSelectCount = iCurrentSelectCount + 1 260 # Works much better than the original technique
260 if iCurrentSelectCount == iSelectSampleCount: 261 # get total number of samples
261 break 262
262 if iNode2<iSampleCount: 263 iCurrentSelectCount = 0
263 lsReturnSamplesRet.append(lsSampleNames[iNode2]) 264 for row in linkageMatrix:
264 iCurrentSelectCount = iCurrentSelectCount + 1 265 # Get nodes ofthe lowest pairing (so the furthest apart pair)
265 if iCurrentSelectCount == iSelectSampleCount: 266 iNode1 = int(row[0])
266 break 267 iNode2 = int(row[1])
267 268 # Make sure the nodes are a terminal node (sample) and not a branch in the dendrogram
268 #Return selected samples 269 # The branching in the dendrogram will start at the number of samples and increment higher.
269 return lsReturnSamplesRet 270 # Add each of the pair one at a time breaking when enough samples are selected.
270 271 if iNode1 < iSampleCount:
271 ####Group 4## Rank Average of user Defined Taxa 272 lsReturnSamplesRet.append(lsSampleNames[iNode1])
272 #Testing: Happy Path Tested 273 iCurrentSelectCount = iCurrentSelectCount + 1
273 def funcGetAverageAbundanceSamples(self, abndTable, lsTargetedFeature, fRank=False): 274 if iCurrentSelectCount == iSelectSampleCount:
274 """ 275 break
275 Averages feature abundance or ranked abundance. Expects a column 0 of taxa id that is skipped. 276 if iNode2 < iSampleCount:
276 277 lsReturnSamplesRet.append(lsSampleNames[iNode2])
277 :param abndTable: Abundance Table to analyse 278 iCurrentSelectCount = iCurrentSelectCount + 1
278 :type: AbundanceTable Abundance Table 279 if iCurrentSelectCount == iSelectSampleCount:
279 :param lsTargetedFeature: String names 280 break
280 :type: list list of string names of features (bugs) which are measured after ranking against the full sample 281
281 :param fRank: Indicates to rank the abundance before getting the average abundance of the features (default false) 282 # Return selected samples
282 :type: boolean Flag indicating ranking abundance before calculating average feature measurement (false= no ranking) 283 return lsReturnSamplesRet
283 :return List of lists or boolean: List of lists or False on error. One internal list per sample indicating the sample, 284
284 feature average abundance or ranked abundance. Lists will already be sorted. 285 # Group 4## Rank Average of user Defined Taxa
285 For not Ranked [[sample,average abundance of selected feature,1]] 286 # Testing: Happy Path Tested
286 For Ranked [[sample,average ranked abundance, average abundance of selected feature]] 287 def funcGetAverageAbundanceSamples(self, abndTable, lsTargetedFeature, fRank=False):
287 Error Returns false 288 """
288 """ 289 Averages feature abundance or ranked abundance. Expects a column 0 of taxa id that is skipped.
289 290
290 llAbundance = abndTable.funcGetAverageAbundancePerSample(lsTargetedFeature) 291 :param abndTable: Abundance Table to analyse
291 if not llAbundance: 292 :type: AbundanceTable Abundance Table
292 logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not get average abundance, returned false. Make sure the features (bugs) are spelled correctly and in the abundance table.") 293 :param lsTargetedFeature: String names
293 return False 294 :type: list list of string names of features (bugs) which are measured after ranking against the full sample
294 #Add a space for ranking if needed 295 :param fRank: Indicates to rank the abundance before getting the average abundance of the features (default false)
295 #Not ranked will be [[sSample,average abundance,1]] 296 :type: boolean Flag indicating ranking abundance before calculating average feature measurement (false= no ranking)
296 #(where 1 will not discriminant ties if used in later functions, so this generalizes) 297 :return List of lists or boolean: List of lists or False on error. One internal list per sample indicating the sample,
297 #Ranked will be [[sSample, average rank, average abundance]] 298 feature average abundance or ranked abundance. Lists will already be sorted.
298 llRetAbundance = [[llist[0],-1,llist[1]] for llist in llAbundance] 299 For not Ranked [[sample,average abundance of selected feature,1]]
299 #Rank if needed 300 For Ranked [[sample,average ranked abundance, average abundance of selected feature]]
300 if fRank: 301 Error Returns false
301 abndRanked = abndTable.funcRankAbundance() 302 """
302 if abndRanked == None: 303
303 logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not rank the abundance table, returned false.") 304 llAbundance = abndTable.funcGetAverageAbundancePerSample(
304 return False 305 lsTargetedFeature)
305 llRetRank = abndRanked.funcGetAverageAbundancePerSample(lsTargetedFeature) 306 if not llAbundance:
306 if not llRetRank: 307 logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not get average abundance, returned false. Make sure the features (bugs) are spelled correctly and in the abundance table.")
307 logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not get average ranked abundance, returned false. Make sure the features (bugs) are spelled correctly and in the abundance table.") 308 return False
308 return False 309 # Add a space for ranking if needed
309 dictRanks = dict(llRetRank) 310 # Not ranked will be [[sSample,average abundance,1]]
310 llRetAbundance = [[a[0],dictRanks[a[0]],a[2]] for a in llRetAbundance] 311 # (where 1 will not discriminant ties if used in later functions, so this generalizes)
311 312 # Ranked will be [[sSample, average rank, average abundance]]
312 #Sort first for ties and then for the main feature 313 llRetAbundance = [[llist[0], -1, llist[1]] for llist in llAbundance]
313 if not fRank or ConstantsMicropita.c_fBreakRankTiesByDiversity: 314 # Rank if needed
314 llRetAbundance = sorted(llRetAbundance, key = lambda sampleData: sampleData[2], reverse = not fRank) 315 if fRank:
315 if fRank: 316 abndRanked = abndTable.funcRankAbundance()
316 llRetAbundance = sorted(llRetAbundance, key = lambda sampleData: sampleData[1], reverse = not fRank) 317 if abndRanked == None:
317 return llRetAbundance 318 logging.error(
318 319 "MicroPITA.funcGetAverageAbundanceSamples:: Could not rank the abundance table, returned false.")
319 #Testing: Happy Path Tested 320 return False
320 def funcSelectTargetedTaxaSamples(self, abndMatrix, lsTargetedTaxa, iSampleSelectionCount, sMethod = ConstantsMicropita.lsTargetedFeatureMethodValues[0]): 321 llRetRank = abndRanked.funcGetAverageAbundancePerSample(
321 """ 322 lsTargetedFeature)
322 Selects samples with the highest ranks or abundance of targeted features. 323 if not llRetRank:
323 If ranked, select the highest abundance for tie breaking 324 logging.error(
324 325 "MicroPITA.funcGetAverageAbundanceSamples:: Could not get average ranked abundance, returned false. Make sure the features (bugs) are spelled correctly and in the abundance table.")
325 :param abndMatrix: Abundance table to analyse 326 return False
326 :type: AbundanceTable Abundance table 327 dictRanks = dict(llRetRank)
327 :param lsTargetedTaxa: List of features 328 llRetAbundance = [[a[0], dictRanks[a[0]], a[2]]
328 :type: list list of strings 329 for a in llRetAbundance]
329 :param iSampleSelectionCount: Number of samples to select 330
330 :type: integer integer 331 # Sort first for ties and then for the main feature
331 :param sMethod: Method to select targeted features 332 if not fRank or ConstantsMicropita.c_fBreakRankTiesByDiversity:
332 :type: string String (Can be values found in ConstantsMicropita.lsTargetedFeatureMethodValues) 333 llRetAbundance = sorted(
333 :return List of strings: List of sample names which were selected 334 llRetAbundance, key=lambda sampleData: sampleData[2], reverse=not fRank)
334 List of strings Empty list is returned on an error. 335 if fRank:
335 """ 336 llRetAbundance = sorted(
336 337 llRetAbundance, key=lambda sampleData: sampleData[1], reverse=not fRank)
337 #Check data 338 return llRetAbundance
338 if(len(lsTargetedTaxa) < 1): 339
339 logging.error("MicroPITA.funcSelectTargetedTaxaSamples. Taxa defined selection was requested but no features were given.") 340 # Testing: Happy Path Tested
340 return [] 341 def funcSelectTargetedTaxaSamples(self, abndMatrix, lsTargetedTaxa, iSampleSelectionCount, sMethod=ConstantsMicropita.lsTargetedFeatureMethodValues[0]):
341 342 """
342 lsTargetedSamples = self.funcGetAverageAbundanceSamples(abndTable=abndMatrix, lsTargetedFeature=lsTargetedTaxa, 343 Selects samples with the highest ranks or abundance of targeted features.
343 fRank=sMethod.lower() == self.c_strTargetedRanked.lower()) 344 If ranked, select the highest abundance for tie breaking
344 #If an error occured or the key word for the method was not recognized 345
345 if lsTargetedSamples == False: 346 :param abndMatrix: Abundance table to analyse
346 logging.error("MicroPITA.funcSelectTargetedTaxaSamples:: Was not able to select for the features given. So targeted feature selection was performed. Check to make sure the features are spelled correctly and exist in the abundance file.") 347 :type: AbundanceTable Abundance table
347 return [] 348 :param lsTargetedTaxa: List of features
348 349 :type: list list of strings
349 #Select from results 350 :param iSampleSelectionCount: Number of samples to select
350 return [sSample[0] for sSample in lsTargetedSamples[:iSampleSelectionCount]] 351 :type: integer integer
351 352 :param sMethod: Method to select targeted features
352 ####Group 5## Random 353 :type: string String (Can be values found in ConstantsMicropita.lsTargetedFeatureMethodValues)
353 #Testing: Happy path Tested 354 :return List of strings: List of sample names which were selected
354 def funcGetRandomSamples(self, lsSamples=None, iNumberOfSamplesToReturn=0): 355 List of strings Empty list is returned on an error.
355 """ 356 """
356 Returns random sample names of the number given. No replacement. 357
357 358 # Check data
358 :param lsSamples: List of sample names 359 if(len(lsTargetedTaxa) < 1):
359 :type: list list of strings 360 logging.error(
360 :param iNumberOfSamplesToReturn: Number of samples to select 361 "MicroPITA.funcSelectTargetedTaxaSamples. Taxa defined selection was requested but no features were given.")
361 :type: integer integer. 362 return []
362 :return List: List of selected samples (strings). 363
363 """ 364 lsTargetedSamples = self.funcGetAverageAbundanceSamples(abndTable=abndMatrix, lsTargetedFeature=lsTargetedTaxa,
364 365 fRank=sMethod.lower() == self.c_strTargetedRanked.lower())
365 #Input matrix sample count 366 # If an error occured or the key word for the method was not recognized
366 sampleCount = len(lsSamples) 367 if lsTargetedSamples == False:
367 368 logging.error("MicroPITA.funcSelectTargetedTaxaSamples:: Was not able to select for the features given. So targeted feature selection was performed. Check to make sure the features are spelled correctly and exist in the abundance file.")
368 #Return the full matrix if they ask for a return matrix where length == original 369 return []
369 if(iNumberOfSamplesToReturn >= sampleCount): 370
370 return lsSamples 371 # Select from results
371 372 return [sSample[0] for sSample in lsTargetedSamples[:iSampleSelectionCount]]
372 #Get the random indices for the sample (without replacement) 373
373 liRandomIndices = random.sample(range(sampleCount), iNumberOfSamplesToReturn) 374 # Group 5## Random
374 375 # Testing: Happy path Tested
375 #Create a boolean array of if indexes are to be included in the reduced array 376 def funcGetRandomSamples(self, lsSamples=None, iNumberOfSamplesToReturn=0):
376 return [sSample for iIndex, sSample in enumerate(lsSamples) if iIndex in liRandomIndices] 377 """
377 378 Returns random sample names of the number given. No replacement.
378 #Happy path tested (case 3) 379
379 def funcGetAveragePopulation(self, abndTable, lfCompress): 380 :param lsSamples: List of sample names
380 """ 381 :type: list list of strings
381 Get the average row per column in the abndtable. 382 :param iNumberOfSamplesToReturn: Number of samples to select
382 383 :type: integer integer.
383 :param abndTable: AbundanceTable of data to be averaged 384 :return List: List of selected samples (strings).
384 :type: AbudanceTable 385 """
385 :param lfCompress: List of boolean flags (false means to remove sample before averaging 386
386 :type: List of floats 387 # Input matrix sample count
387 :return List of doubles: 388 sampleCount = len(lsSamples)
388 """ 389
389 if sum(lfCompress) == 0: 390 # Return the full matrix if they ask for a return matrix where length == original
390 return [] 391 if(iNumberOfSamplesToReturn >= sampleCount):
391 392 return lsSamples
392 #Get the average populations 393
393 lAverageRet = [] 394 # Get the random indices for the sample (without replacement)
394 395 liRandomIndices = random.sample(
395 for sFeature in abndTable.funcGetAbundanceCopy(): 396 range(sampleCount), iNumberOfSamplesToReturn)
396 sFeature = list(sFeature)[1:] 397
397 sFeature=np.compress(lfCompress,sFeature,axis=0) 398 # Create a boolean array of if indexes are to be included in the reduced array
398 lAverageRet.append(sum(sFeature)/float(len(sFeature))) 399 return [sSample for iIndex, sSample in enumerate(lsSamples) if iIndex in liRandomIndices]
399 return lAverageRet 400
400 401 # Happy path tested (case 3)
401 #Happy path tested (2 cases) 402 def funcGetAveragePopulation(self, abndTable, lfCompress):
402 def funcGetDistanceFromAverage(self, abndTable,ldAverage,lsSamples,lfSelected): 403 """
403 """ 404 Get the average row per column in the abndtable.
404 Given an abundance table and an average sample, this returns the distance of each sample 405
405 (measured using brays-curtis dissimilarity) from the average. 406 :param abndTable: AbundanceTable of data to be averaged
406 The distances are reduced by needing to be in the lsSamples and being a true in the lfSelected 407 :type: AbudanceTable
407 (which is associated with the samples in the order of the samples in the abundance table; 408 :param lfCompress: List of boolean flags (false means to remove sample before averaging
408 use abundancetable.funcGetSampleNames() to see the order if needed). 409 :type: List of floats
409 410 :return List of doubles:
410 :param abndTable: Abundance table holding the data to be analyzed. 411 """
411 :type: AbundanceTable 412 if sum(lfCompress) == 0:
412 :param ldAverage: Average population (Average features of the abundance table of samples) 413 return []
413 :type: List of doubles which represent the average population 414
414 :param lsSamples: These are the only samples used in the analysis 415 # Get the average populations
415 :type: List of strings (sample ids) 416 lAverageRet = []
416 :param lfSelected: Samples to be included in the analysis 417
417 :type: List of boolean (true means include) 418 for sFeature in abndTable.funcGetAbundanceCopy():
418 :return: List of distances (doubles) 419 sFeature = list(sFeature)[1:]
419 """ 420 sFeature = np.compress(lfCompress, sFeature, axis=0)
420 #Get the distance from label 1 of all samples in label0 splitting into selected and not selected lists 421 lAverageRet.append(sum(sFeature)/float(len(sFeature)))
421 ldSelectedDistances = [] 422 return lAverageRet
422 423
423 for sSampleName in [sSample for iindex, sSample in enumerate(lsSamples) if lfSelected[iindex]]: 424 # Happy path tested (2 cases)
424 #Get the sample measurements 425 def funcGetDistanceFromAverage(self, abndTable, ldAverage, lsSamples, lfSelected):
425 ldSelectedDistances.append(Metric.funcGetBrayCurtisDissimilarity(np.array([abndTable.funcGetSample(sSampleName),ldAverage]))[0]) 426 """
426 return ldSelectedDistances 427 Given an abundance table and an average sample, this returns the distance of each sample
427 428 (measured using brays-curtis dissimilarity) from the average.
428 #Happy path tested (1 case) 429 The distances are reduced by needing to be in the lsSamples and being a true in the lfSelected
429 def funcMeasureDistanceFromLabelToAverageOtherLabel(self, abndTable, lfGroupOfInterest, lfGroupOther): 430 (which is associated with the samples in the order of the samples in the abundance table;
430 """ 431 use abundancetable.funcGetSampleNames() to see the order if needed).
431 Get the distance of samples from one label from the average sample of not the label. 432
432 Note: This assumes 2 classes. 433 :param abndTable: Abundance table holding the data to be analyzed.
433 434 :type: AbundanceTable
434 :param abndTable: Table of data to work out of. 435 :param ldAverage: Average population (Average features of the abundance table of samples)
435 :type: Abundace Table 436 :type: List of doubles which represent the average population
436 :param lfGroupOfInterest: Boolean indicator of the sample being in the first group. 437 :param lsSamples: These are the only samples used in the analysis
437 :type: List of floats, true indicating an individual in the group of interest. 438 :type: List of strings (sample ids)
438 :param lfGroupOther: Boolean indicator of the sample being in the other group. 439 :param lfSelected: Samples to be included in the analysis
439 :type: List of floats, true indicating an individual in the 440 :type: List of boolean (true means include)
440 :return List of List of doubles: [list of tuples (string sample name,double distance) for the selected population, list of tuples for the not selected population] 441 :return: List of distances (doubles)
441 """ 442 """
442 #Get all sample names 443 # Get the distance from label 1 of all samples in label0 splitting into selected and not selected lists
443 lsAllSamples = abndTable.funcGetSampleNames() 444 ldSelectedDistances = []
444 445
445 #Get average populations 446 for sSampleName in [sSample for iindex, sSample in enumerate(lsSamples) if lfSelected[iindex]]:
446 lAverageOther = self.funcGetAveragePopulation(abndTable=abndTable, lfCompress=lfGroupOther) 447 # Get the sample measurements
447 448 ldSelectedDistances.append(Metric.funcGetBrayCurtisDissimilarity(
448 #Get the distance from the average of the other label (label 1) 449 np.array([abndTable.funcGetSample(sSampleName), ldAverage]))[0])
449 ldSelectedDistances = self.funcGetDistanceFromAverage(abndTable=abndTable, ldAverage=lAverageOther, 450 return ldSelectedDistances
450 lsSamples=lsAllSamples, lfSelected=lfGroupOfInterest) 451
451 452 # Happy path tested (1 case)
452 return zip([lsAllSamples[iindex] for iindex, fGroup in enumerate(lfGroupOfInterest) if fGroup],ldSelectedDistances) 453 def funcMeasureDistanceFromLabelToAverageOtherLabel(self, abndTable, lfGroupOfInterest, lfGroupOther):
453 454 """
454 #Happy path tested (1 test case) 455 Get the distance of samples from one label from the average sample of not the label.
455 def funcPerformDistanceSelection(self, abndTable, iSelectionCount, sLabel, sValueOfInterest): 456 Note: This assumes 2 classes.
456 """ 457
457 Given metadata, metadata of one value (sValueOfInterest) is measured from the average (centroid) value of another label group. 458 :param abndTable: Table of data to work out of.
458 An iSelectionCount of samples is selected from the group of interest closest to and furthest from the centroid of the other group. 459 :type: Abundace Table
459 460 :param lfGroupOfInterest: Boolean indicator of the sample being in the first group.
460 :params abndTable: Abundance of measurements 461 :type: List of floats, true indicating an individual in the group of interest.
461 :type: AbundanceTable 462 :param lfGroupOther: Boolean indicator of the sample being in the other group.
462 :params iSelectionCount: The number of samples selected per sample. 463 :type: List of floats, true indicating an individual in the
463 :type: Integer Integer greater than 0 464 :return List of List of doubles: [list of tuples (string sample name,double distance) for the selected population, list of tuples for the not selected population]
464 :params sLabel: ID of the metadata which is the supervised label 465 """
465 :type: String 466 # Get all sample names
466 :params sValueOfInterest: Metadata value in the sLabel metadta row of the abundance table which defines the group of interest. 467 lsAllSamples = abndTable.funcGetSampleNames()
467 :type: String found in the abundance table metadata row indicated by sLabel. 468
468 :return list list of tuples (samplename, distance) [[iSelectionCount of tuples closest to the other centroid], [iSelectionCount of tuples farthest from the other centroid], [all tuples of samples not selected]] 469 # Get average populations
469 """ 470 lAverageOther = self.funcGetAveragePopulation(
470 471 abndTable=abndTable, lfCompress=lfGroupOther)
471 lsMetadata = abndTable.funcGetMetadata(sLabel) 472
472 #Other metadata values 473 # Get the distance from the average of the other label (label 1)
473 lsUniqueOtherValues = list(set(lsMetadata)-set(sValueOfInterest)) 474 ldSelectedDistances = self.funcGetDistanceFromAverage(abndTable=abndTable, ldAverage=lAverageOther,
474 475 lsSamples=lsAllSamples, lfSelected=lfGroupOfInterest)
475 #Get boolean indicator of values of interest 476
476 lfLabelsInterested = [sValueOfInterest == sValue for sValue in lsMetadata] 477 return zip([lsAllSamples[iindex] for iindex, fGroup in enumerate(lfGroupOfInterest) if fGroup], ldSelectedDistances)
477 478
478 #Get the distances of the items of interest from the other metadata values 479 # Happy path tested (1 test case)
479 dictDistanceAverages = {} 480 def funcPerformDistanceSelection(self, abndTable, iSelectionCount, sLabel, sValueOfInterest):
480 for sOtherLabel in lsUniqueOtherValues: 481 """
481 #Get boolean indicator of labels not of interest 482 Given metadata, metadata of one value (sValueOfInterest) is measured from the average (centroid) value of another label group.
482 lfLabelsOther = [sOtherLabel == sValue for sValue in lsMetadata] 483 An iSelectionCount of samples is selected from the group of interest closest to and furthest from the centroid of the other group.
483 484
484 #Get the distances of data from two different groups to the average of the other 485 :params abndTable: Abundance of measurements
485 ldValueDistances = dict(self.funcMeasureDistanceFromLabelToAverageOtherLabel(abndTable, lfLabelsInterested, lfLabelsOther)) 486 :type: AbundanceTable
486 487 :params iSelectionCount: The number of samples selected per sample.
487 for sKey in ldValueDistances: 488 :type: Integer Integer greater than 0
488 dictDistanceAverages[sKey] = ldValueDistances[sKey] + dictDistanceAverages[sKey] if sKey in dictDistanceAverages else ldValueDistances[sKey] 489 :params sLabel: ID of the metadata which is the supervised label
489 490 :type: String
490 #Finish average by dividing by length of lsUniqueOtherValues 491 :params sValueOfInterest: Metadata value in the sLabel metadta row of the abundance table which defines the group of interest.
491 ltpleAverageDistances = [(sKey, dictDistanceAverages[sKey]/float(len(lsUniqueOtherValues))) for sKey in dictDistanceAverages] 492 :type: String found in the abundance table metadata row indicated by sLabel.
492 493 :return list list of tuples (samplename, distance) [[iSelectionCount of tuples closest to the other centroid], [iSelectionCount of tuples farthest from the other centroid], [all tuples of samples not selected]]
493 #Sort to extract extremes 494 """
494 ltpleAverageDistances = sorted(ltpleAverageDistances,key=operator.itemgetter(1)) 495
495 496 lsMetadata = abndTable.funcGetMetadata(sLabel)
496 #Get the closest and farthest distances 497 # Other metadata values
497 ltupleDiscriminantSamples = ltpleAverageDistances[:iSelectionCount] 498 lsUniqueOtherValues = list(set(lsMetadata)-set(sValueOfInterest))
498 ltupleDistinctSamples = ltpleAverageDistances[iSelectionCount*-1:] 499
499 500 # Get boolean indicator of values of interest
500 #Remove the selected samples from the larger population of distances (better visualization) 501 lfLabelsInterested = [sValueOfInterest ==
501 ldSelected = [tpleSelected[0] for tpleSelected in ltupleDiscriminantSamples+ltupleDistinctSamples] 502 sValue for sValue in lsMetadata]
502 503
503 #Return discriminant tuples, distinct tuples, other tuples 504 # Get the distances of the items of interest from the other metadata values
504 return [ltupleDiscriminantSamples, ltupleDistinctSamples, 505 dictDistanceAverages = {}
505 [tplData for tplData in ltpleAverageDistances if tplData[0] not in ldSelected]] 506 for sOtherLabel in lsUniqueOtherValues:
506 507 # Get boolean indicator of labels not of interest
507 #Run the supervised method surrounding distance from centroids 508 lfLabelsOther = [sOtherLabel == sValue for sValue in lsMetadata]
508 #Happy path tested (3 test cases) 509
509 def funcRunSupervisedDistancesFromCentroids(self, abundanceTable, fRunDistinct, fRunDiscriminant, 510 # Get the distances of data from two different groups to the average of the other
510 xOutputSupFile, xPredictSupFile, strSupervisedMetadata, 511 ldValueDistances = dict(self.funcMeasureDistanceFromLabelToAverageOtherLabel(
511 iSampleSupSelectionCount, lsOriginalSampleNames, lsOriginalLabels, fAppendFiles = False): 512 abndTable, lfLabelsInterested, lfLabelsOther))
512 """ 513
513 Runs supervised methods based on measuring distances of one label from the centroid of another. NAs are evaluated as theirown group. 514 for sKey in ldValueDistances:
514 515 dictDistanceAverages[sKey] = ldValueDistances[sKey] + \
515 :param abundanceTable: AbundanceTable 516 dictDistanceAverages[sKey] if sKey in dictDistanceAverages else ldValueDistances[sKey]
516 :type: AbudanceTable Data to analyze 517
517 :param fRunDistinct: Run distinct selection method 518 # Finish average by dividing by length of lsUniqueOtherValues
518 :type: Boolean boolean (true runs method) 519 ltpleAverageDistances = [(sKey, dictDistanceAverages[sKey]/float(
519 :param fRunDiscriminant: Run discriminant method 520 len(lsUniqueOtherValues))) for sKey in dictDistanceAverages]
520 :type: Boolean boolean (true runs method) 521
521 :param xOutputSupFile: File output from supervised methods detailing data going into the method. 522 # Sort to extract extremes
522 :type: String or FileStream 523 ltpleAverageDistances = sorted(
523 :param xPredictSupFile: File output from supervised methods distance results from supervised methods. 524 ltpleAverageDistances, key=operator.itemgetter(1))
524 :type: String or FileStream 525
525 :param strSupervisedMetadata: The metadata that will be used to group samples. 526 # Get the closest and farthest distances
526 :type: String 527 ltupleDiscriminantSamples = ltpleAverageDistances[:iSelectionCount]
527 :param iSampleSupSelectionCount: Number of samples to select 528 ltupleDistinctSamples = ltpleAverageDistances[iSelectionCount*-1:]
528 :type: Integer int sample selection count 529
529 :param lsOriginalSampleNames: List of the sample names, order is important and should be preserved from the abundanceTable. 530 # Remove the selected samples from the larger population of distances (better visualization)
530 :type: List of samples 531 ldSelected = [tpleSelected[0]
531 :param fAppendFiles: Indicates that output files already exist and appending is occuring. 532 for tpleSelected in ltupleDiscriminantSamples+ltupleDistinctSamples]
532 :type: Boolean 533
533 :return Selected Samples: A dictionary of selected samples by selection ID 534 # Return discriminant tuples, distinct tuples, other tuples
534 Dictionary {"Selection Method":["SampleID","SampleID"...]} 535 return [ltupleDiscriminantSamples, ltupleDistinctSamples,
535 """ 536 [tplData for tplData in ltpleAverageDistances if tplData[0] not in ldSelected]]
536 #Get labels and run one label against many 537
537 lstrMetadata = abundanceTable.funcGetMetadata(strSupervisedMetadata) 538 # Run the supervised method surrounding distance from centroids
538 dictlltpleDistanceMeasurements = {} 539 # Happy path tested (3 test cases)
539 for sMetadataValue in set(lstrMetadata): 540 def funcRunSupervisedDistancesFromCentroids(self, abundanceTable, fRunDistinct, fRunDiscriminant,
540 541 xOutputSupFile, xPredictSupFile, strSupervisedMetadata,
541 #For now perform the selection here for the label of interest against the other labels 542 iSampleSupSelectionCount, lsOriginalSampleNames, lsOriginalLabels, fAppendFiles=False):
542 dictlltpleDistanceMeasurements.setdefault(sMetadataValue,[]).extend(self.funcPerformDistanceSelection(abndTable=abundanceTable, 543 """
543 iSelectionCount=iSampleSupSelectionCount, sLabel=strSupervisedMetadata, sValueOfInterest=sMetadataValue)) 544 Runs supervised methods based on measuring distances of one label from the centroid of another. NAs are evaluated as theirown group.
544 545
545 #Make expected output files for supervised methods 546 :param abundanceTable: AbundanceTable
546 #1. Output file which is similar to an input file for SVMs 547 :type: AbudanceTable Data to analyze
547 #2. Output file that is similar to the probabilitic output of a SVM (LibSVM) 548 :param fRunDistinct: Run distinct selection method
548 #Manly for making output of supervised methods (Distance from Centroid) similar 549 :type: Boolean boolean (true runs method)
549 #MicropitaVis needs some of these files 550 :param fRunDiscriminant: Run discriminant method
550 if xOutputSupFile: 551 :type: Boolean boolean (true runs method)
551 if fAppendFiles: 552 :param xOutputSupFile: File output from supervised methods detailing data going into the method.
552 SVM.funcUpdateSVMFileWithAbundanceTable(abndAbundanceTable=abundanceTable, xOutputSVMFile=xOutputSupFile, 553 :type: String or FileStream
553 lsOriginalLabels=lsOriginalLabels, lsSampleOrdering=lsOriginalSampleNames) 554 :param xPredictSupFile: File output from supervised methods distance results from supervised methods.
554 else: 555 :type: String or FileStream
555 SVM.funcConvertAbundanceTableToSVMFile(abndAbundanceTable=abundanceTable, xOutputSVMFile=xOutputSupFile, 556 :param strSupervisedMetadata: The metadata that will be used to group samples.
556 sMetadataLabel=strSupervisedMetadata, lsOriginalLabels=lsOriginalLabels, lsSampleOrdering=lsOriginalSampleNames) 557 :type: String
557 558 :param iSampleSupSelectionCount: Number of samples to select
558 #Will contain the samples selected to return 559 :type: Integer int sample selection count
559 #One or more of the methods may be active so this is why I am extending instead of just returning the result of each method type 560 :param lsOriginalSampleNames: List of the sample names, order is important and should be preserved from the abundanceTable.
560 dictSelectedSamplesRet = dict() 561 :type: List of samples
561 for sKey, ltplDistances in dictlltpleDistanceMeasurements.items(): 562 :param fAppendFiles: Indicates that output files already exist and appending is occuring.
562 if fRunDistinct: 563 :type: Boolean
563 dictSelectedSamplesRet.setdefault(ConstantsMicropita.c_strDistinct,[]).extend([ltple[0] for ltple in ltplDistances[1]]) 564 :return Selected Samples: A dictionary of selected samples by selection ID
564 if fRunDiscriminant: 565 Dictionary {"Selection Method":["SampleID","SampleID"...]}
565 dictSelectedSamplesRet.setdefault(ConstantsMicropita.c_strDiscriminant,[]).extend([ltple[0] for ltple in ltplDistances[0]]) 566 """
566 567 # Get labels and run one label against many
567 if xPredictSupFile: 568 lstrMetadata = abundanceTable.funcGetMetadata(strSupervisedMetadata)
568 dictFlattenedDistances = dict() 569 dictlltpleDistanceMeasurements = {}
569 [dictFlattenedDistances.setdefault(sKey, []).append(tple) 570 for sMetadataValue in set(lstrMetadata):
570 for sKey, lltple in dictlltpleDistanceMeasurements.items() 571
571 for ltple in lltple for tple in ltple] 572 # For now perform the selection here for the label of interest against the other labels
572 if fAppendFiles: 573 dictlltpleDistanceMeasurements.setdefault(sMetadataValue, []).extend(self.funcPerformDistanceSelection(abndTable=abundanceTable,
573 self._updatePredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xOutputSupFile, 574 iSelectionCount=iSampleSupSelectionCount, sLabel=strSupervisedMetadata, sValueOfInterest=sMetadataValue))
574 dictltpleDistanceMeasurements=dictFlattenedDistances, abundanceTable=abundanceTable, lsOriginalSampleNames=lsOriginalSampleNames) 575
575 else: 576 # Make expected output files for supervised methods
576 self._writeToPredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xOutputSupFile, 577 # 1. Output file which is similar to an input file for SVMs
577 dictltpleDistanceMeasurements=dictFlattenedDistances, abundanceTable=abundanceTable, lsOriginalSampleNames=lsOriginalSampleNames) 578 # 2. Output file that is similar to the probabilitic output of a SVM (LibSVM)
578 return dictSelectedSamplesRet 579 # Manly for making output of supervised methods (Distance from Centroid) similar
579 580 # MicropitaVis needs some of these files
580 #Two happy path test cases 581 if xOutputSupFile:
581 def _updatePredictFile(self, xPredictSupFile, xInputLabelsFile, dictltpleDistanceMeasurements, abundanceTable, lsOriginalSampleNames): 582 if fAppendFiles:
582 """ 583 SVM.funcUpdateSVMFileWithAbundanceTable(abndAbundanceTable=abundanceTable, xOutputSVMFile=xOutputSupFile,
583 Manages updating the predict file. 584 lsOriginalLabels=lsOriginalLabels, lsSampleOrdering=lsOriginalSampleNames)
584 585 else:
585 :param xPredictSupFile: File that has predictions (distances) from the supervised method. 586 SVM.funcConvertAbundanceTableToSVMFile(abndAbundanceTable=abundanceTable, xOutputSVMFile=xOutputSupFile,
586 :type: FileStream or String file path 587 sMetadataLabel=strSupervisedMetadata, lsOriginalLabels=lsOriginalLabels, lsSampleOrdering=lsOriginalSampleNames)
587 :param xInputLabelsFile: File that as input to the supervised methods. 588
588 :type: FileStream or String file path 589 # Will contain the samples selected to return
589 :param dictltpleDistanceMeasurements: 590 # One or more of the methods may be active so this is why I am extending instead of just returning the result of each method type
590 :type: Dictionary of lists of tuples {"labelgroup":[("SampleName",dDistance)...], "labelgroup":[("SampleName",dDistance)...]} 591 dictSelectedSamplesRet = dict()
591 """ 592 for sKey, ltplDistances in dictlltpleDistanceMeasurements.items():
592 593 if fRunDistinct:
593 if not isinstance(xPredictSupFile, str): 594 dictSelectedSamplesRet.setdefault(ConstantsMicropita.c_strDistinct, []).extend([
594 xPredictSupFile.close() 595 ltple[0] for ltple in ltplDistances[1]])
595 xPredictSupFile = xPredictSupFile.name 596 if fRunDiscriminant:
596 csvr = open(xPredictSupFile,'r') 597 dictSelectedSamplesRet.setdefault(ConstantsMicropita.c_strDiscriminant, []).extend([
597 598 ltple[0] for ltple in ltplDistances[0]])
598 f = csv.reader(csvr,delimiter=ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace) 599
599 lsHeader = f.next()[1:] 600 if xPredictSupFile:
600 dictlltpleRead = dict([(sHeader,[]) for sHeader in lsHeader]) 601 dictFlattenedDistances = dict()
601 602 [dictFlattenedDistances.setdefault(sKey, []).append(tple)
602 #Read data in 603 for sKey, lltple in dictlltpleDistanceMeasurements.items()
603 iSampleIndex = 0 604 for ltple in lltple for tple in ltple]
604 for sRow in f: 605 if fAppendFiles:
605 sLabel = sRow[0] 606 self._updatePredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xOutputSupFile,
606 [dictlltpleRead[lsHeader[iDistanceIndex]].append((lsOriginalSampleNames[iSampleIndex],dDistance)) for iDistanceIndex, dDistance in enumerate(sRow[1:]) 607 dictltpleDistanceMeasurements=dictFlattenedDistances, abundanceTable=abundanceTable, lsOriginalSampleNames=lsOriginalSampleNames)
607 if not dDistance == ConstantsMicropita.c_sEmptyPredictFileValue] 608 else:
608 iSampleIndex += 1 609 self._writeToPredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xOutputSupFile,
609 610 dictltpleDistanceMeasurements=dictFlattenedDistances, abundanceTable=abundanceTable, lsOriginalSampleNames=lsOriginalSampleNames)
610 #Combine dictltpleDistanceMeasurements with new data 611 return dictSelectedSamplesRet
611 #If they share a key then merge keeping parameter data 612
612 #If they do not share the key, keep the full data 613 # Two happy path test cases
613 dictNew = {} 614 def _updatePredictFile(self, xPredictSupFile, xInputLabelsFile, dictltpleDistanceMeasurements, abundanceTable, lsOriginalSampleNames):
614 for sKey in dictltpleDistanceMeasurements.keys(): 615 """
615 lsSamples = [tple[0] for tple in dictltpleDistanceMeasurements[sKey]] 616 Manages updating the predict file.
616 dictNew[sKey] = dictltpleDistanceMeasurements[sKey]+[tple for tple in dictlltpleRead[sKey] if tple[0] not in lsSamples] if sKey in dictlltpleRead.keys() else dictltpleDistanceMeasurements[sKey] 617
617 for sKey in dictlltpleRead: 618 :param xPredictSupFile: File that has predictions (distances) from the supervised method.
618 if sKey not in dictltpleDistanceMeasurements.keys(): 619 :type: FileStream or String file path
619 dictNew[sKey] = dictlltpleRead[sKey] 620 :param xInputLabelsFile: File that as input to the supervised methods.
620 621 :type: FileStream or String file path
621 #Call writer 622 :param dictltpleDistanceMeasurements:
622 self._writeToPredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xInputLabelsFile, 623 :type: Dictionary of lists of tuples {"labelgroup":[("SampleName",dDistance)...], "labelgroup":[("SampleName",dDistance)...]}
623 dictltpleDistanceMeasurements=dictNew, abundanceTable=abundanceTable, 624 """
624 lsOriginalSampleNames=lsOriginalSampleNames, fFromUpdate=True) 625
625 626 if not isinstance(xPredictSupFile, str):
626 #2 happy path test cases 627 xPredictSupFile.close()
627 def _writeToPredictFile(self, xPredictSupFile, xInputLabelsFile, dictltpleDistanceMeasurements, abundanceTable, lsOriginalSampleNames, fFromUpdate=False): 628 xPredictSupFile = xPredictSupFile.name
628 """ 629 csvr = open(xPredictSupFile, 'r')
629 Write to the predict file. 630
630 631 f = csv.reader(
631 :param xPredictSupFile: File that has predictions (distances) from the supervised method. 632 csvr, delimiter=ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace)
632 :type: FileStream or String file path 633 lsHeader = f.next()[1:]
633 :param xInputLabelsFile: File that as input to the supervised methods. 634 dictlltpleRead = dict([(sHeader, []) for sHeader in lsHeader])
634 :type: FileStream or String file path 635
635 :param dictltpleDistanceMeasurements: 636 # Read data in
636 :type: Dictionary of lists of tuples {"labelgroup":[("SampleName",dDistance)...], "labelgroup":[("SampleName",dDistance)...]} 637 iSampleIndex = 0
637 :param abundanceTable: An abundance table of the sample data. 638 for sRow in f:
638 :type: AbundanceTable 639 sLabel = sRow[0]
639 :param lsOriginalSampleNames: Used if the file is being updated as the sample names so that it may be passed in and consistent with other writing. 640 [dictlltpleRead[lsHeader[iDistanceIndex]].append((lsOriginalSampleNames[iSampleIndex], dDistance)) for iDistanceIndex, dDistance in enumerate(sRow[1:])
640 Otherwise will use the sample names from the abundance table. 641 if not dDistance == ConstantsMicropita.c_sEmptyPredictFileValue]
641 :type: List of strings 642 iSampleIndex += 1
642 :param fFromUpdate: Indicates if this is part of an update to the file or not. 643
643 :type: Boolean 644 # Combine dictltpleDistanceMeasurements with new data
644 """ 645 # If they share a key then merge keeping parameter data
645 646 # If they do not share the key, keep the full data
646 xInputLabelsFileName = xInputLabelsFile 647 dictNew = {}
647 if not isinstance(xInputLabelsFile,str): 648 for sKey in dictltpleDistanceMeasurements.keys():
648 xInputLabelsFileName = xInputLabelsFile.name 649 lsSamples = [tple[0]
649 f = csv.writer(open(xPredictSupFile,"w") if isinstance(xPredictSupFile, str) else xPredictSupFile,delimiter=ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace) 650 for tple in dictltpleDistanceMeasurements[sKey]]
650 651 dictNew[sKey] = dictltpleDistanceMeasurements[sKey]+[tple for tple in dictlltpleRead[sKey] if tple[0]
651 lsAllSampleNames = abundanceTable.funcGetSampleNames() 652 not in lsSamples] if sKey in dictlltpleRead.keys() else dictltpleDistanceMeasurements[sKey]
652 lsLabels = SVM.funcReadLabelsFromFile(xSVMFile=xInputLabelsFileName, lsAllSampleNames= lsOriginalSampleNames if fFromUpdate else lsAllSampleNames, 653 for sKey in dictlltpleRead:
653 isPredictFile=False) 654 if sKey not in dictltpleDistanceMeasurements.keys():
654 dictLabels = dict([(sSample,sLabel) for sLabel in lsLabels.keys() for sSample in lsLabels[sLabel]]) 655 dictNew[sKey] = dictlltpleRead[sKey]
655 656
656 #Dictionay keys will be used to order the predict file 657 # Call writer
657 lsMeasurementKeys = dictltpleDistanceMeasurements.keys() 658 self._writeToPredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xInputLabelsFile,
658 #Make header 659 dictltpleDistanceMeasurements=dictNew, abundanceTable=abundanceTable,
659 f.writerow(["labels"]+lsMeasurementKeys) 660 lsOriginalSampleNames=lsOriginalSampleNames, fFromUpdate=True)
660 661
661 #Reformat dictionary to make it easier to use 662 # 2 happy path test cases
662 for sKey in dictltpleDistanceMeasurements: 663 def _writeToPredictFile(self, xPredictSupFile, xInputLabelsFile, dictltpleDistanceMeasurements, abundanceTable, lsOriginalSampleNames, fFromUpdate=False):
663 dictltpleDistanceMeasurements[sKey] = dict([ltpl for ltpl in dictltpleDistanceMeasurements[sKey]]) 664 """
664 665 Write to the predict file.
665 for sSample in lsOriginalSampleNames: 666
666 #Make body of file 667 :param xPredictSupFile: File that has predictions (distances) from the supervised method.
667 f.writerow([dictLabels.get(sSample,ConstantsMicropita.c_sEmptyPredictFileValue)]+ 668 :type: FileStream or String file path
668 [str(dictltpleDistanceMeasurements[sKey].get(sSample,ConstantsMicropita.c_sEmptyPredictFileValue)) 669 :param xInputLabelsFile: File that as input to the supervised methods.
669 for sKey in lsMeasurementKeys]) 670 :type: FileStream or String file path
670 671 :param dictltpleDistanceMeasurements:
671 def _funcRunNormalizeSensitiveMethods(self, abndData, iSampleSelectionCount, dictSelectedSamples, lsAlphaMetrics, lsBetaMetrics, lsInverseBetaMetrics, 672 :type: Dictionary of lists of tuples {"labelgroup":[("SampleName",dDistance)...], "labelgroup":[("SampleName",dDistance)...]}
672 fRunDiversity, fRunRepresentative, fRunExtreme, strAlphaMetadata=None, 673 :param abundanceTable: An abundance table of the sample data.
673 istmBetaMatrix=None, istrmTree=None, istrmEnvr=None, fInvertDiversity=False): 674 :type: AbundanceTable
674 """ 675 :param lsOriginalSampleNames: Used if the file is being updated as the sample names so that it may be passed in and consistent with other writing.
675 Manages running methods that are sensitive to normalization. This is called twice, once for the set of methods which should not be normalized and the other 676 Otherwise will use the sample names from the abundance table.
676 for the set that should be normalized. 677 :type: List of strings
677 678 :param fFromUpdate: Indicates if this is part of an update to the file or not.
678 :param abndData: Abundance table object holding the samples to be measured. 679 :type: Boolean
679 :type: AbundanceTable 680 """
680 :param iSampleSelectionCount The number of samples to select per method. 681
681 :type: Integer 682 xInputLabelsFileName = xInputLabelsFile
682 :param dictSelectedSamples Will be added to as samples are selected {"Method:["strSelectedSampleID","strSelectedSampleID"...]}. 683 if not isinstance(xInputLabelsFile, str):
683 :type: Dictionary 684 xInputLabelsFileName = xInputLabelsFile.name
684 :param lsAlphaMetrics: List of alpha metrics to use on alpha metric dependent assays (like highest diversity). 685 f = csv.writer(open(xPredictSupFile, "w") if isinstance(xPredictSupFile, str)
685 :type: List of strings 686 else xPredictSupFile, delimiter=ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace)
686 :param lsBetaMetrics: List of beta metrics to use on beta metric dependent assays (like most representative). 687
687 :type: List of strings 688 lsAllSampleNames = abundanceTable.funcGetSampleNames()
688 :param lsInverseBetaMetrics: List of inverse beta metrics to use on inverse beta metric dependent assays (like most dissimilar). 689 lsLabels = SVM.funcReadLabelsFromFile(xSVMFile=xInputLabelsFileName, lsAllSampleNames=lsOriginalSampleNames if fFromUpdate else lsAllSampleNames,
689 :type: List of strings 690 isPredictFile=False)
690 :param fRunDiversity: Run Diversity based methods (true indicates run). 691 dictLabels = dict([(sSample, sLabel) for sLabel in lsLabels.keys()
691 :type: Boolean 692 for sSample in lsLabels[sLabel]])
692 :param fRunRepresentative: Run Representative based methods (true indicates run). 693
693 :type: Boolean 694 # Dictionay keys will be used to order the predict file
694 :param fRunExtreme: Run Extreme based methods (true indicates run). 695 lsMeasurementKeys = dictltpleDistanceMeasurements.keys()
695 :type: Boolean 696 # Make header
696 :param istmBetaMatrix: File that has a precalculated beta matrix 697 f.writerow(["labels"]+lsMeasurementKeys)
697 :type: File stream or File path string 698
698 :return Selected Samples: Samples selected by methods. 699 # Reformat dictionary to make it easier to use
699 Dictionary {"Selection Method":["SampleID","SampleID","SampleID",...]} 700 for sKey in dictltpleDistanceMeasurements:
700 """ 701 dictltpleDistanceMeasurements[sKey] = dict(
701 702 [ltpl for ltpl in dictltpleDistanceMeasurements[sKey]])
702 #Sample ids/names 703
703 lsSampleNames = abndData.funcGetSampleNames() 704 for sSample in lsOriginalSampleNames:
704 705 # Make body of file
705 #Generate alpha metrics and get most diverse 706 f.writerow([dictLabels.get(sSample, ConstantsMicropita.c_sEmptyPredictFileValue)] +
706 if fRunDiversity: 707 [str(dictltpleDistanceMeasurements[sKey].get(sSample, ConstantsMicropita.c_sEmptyPredictFileValue))
707 708 for sKey in lsMeasurementKeys])
708 #Get Alpha metrics matrix 709
709 internalAlphaMatrix = None 710 def _funcRunNormalizeSensitiveMethods(self, abndData, iSampleSelectionCount, dictSelectedSamples, lsAlphaMetrics, lsBetaMetrics, lsInverseBetaMetrics,
710 #Name of technique 711 fRunDiversity, fRunRepresentative, fRunExtreme, strAlphaMetadata=None,
711 strMethod = [strAlphaMetadata] if strAlphaMetadata else lsAlphaMetrics 712 istmBetaMatrix=None, istrmTree=None, istrmEnvr=None, fInvertDiversity=False):
712 713 """
713 #If given an alpha-diversity metadata 714 Manages running methods that are sensitive to normalization. This is called twice, once for the set of methods which should not be normalized and the other
714 if strAlphaMetadata: 715 for the set that should be normalized.
715 internalAlphaMatrix = [[float(strNum) for strNum in abndData.funcGetMetadata(strAlphaMetadata)]] 716
716 else: 717 :param abndData: Abundance table object holding the samples to be measured.
717 #Expects Observations (Taxa (row) x sample (column)) 718 :type: AbundanceTable
718 #Returns [[metric1-sample1, metric1-sample2, metric1-sample3],[metric1-sample1, metric1-sample2, metric1-sample3]] 719 :param iSampleSelectionCount The number of samples to select per method.
719 internalAlphaMatrix = Metric.funcBuildAlphaMetricsMatrix(npaSampleAbundance = abndData.funcGetAbundanceCopy() 720 :type: Integer
720 if not abndData.funcIsSummed() 721 :param dictSelectedSamples Will be added to as samples are selected {"Method:["strSelectedSampleID","strSelectedSampleID"...]}.
721 else abndData.funcGetFeatureAbundanceTable(abndData.funcGetTerminalNodes()).funcGetAbundanceCopy(), 722 :type: Dictionary
722 lsSampleNames = lsSampleNames, lsDiversityMetricAlpha = lsAlphaMetrics) 723 :param lsAlphaMetrics: List of alpha metrics to use on alpha metric dependent assays (like highest diversity).
723 724 :type: List of strings
724 if internalAlphaMatrix: 725 :param lsBetaMetrics: List of beta metrics to use on beta metric dependent assays (like most representative).
725 #Invert measurments 726 :type: List of strings
726 if fInvertDiversity: 727 :param lsInverseBetaMetrics: List of inverse beta metrics to use on inverse beta metric dependent assays (like most dissimilar).
727 lldNewDiversity = [] 728 :type: List of strings
728 for lsLine in internalAlphaMatrix: 729 :param fRunDiversity: Run Diversity based methods (true indicates run).
729 lldNewDiversity.append([1/max(dValue,ConstantsMicropita.c_smallNumber) for dValue in lsLine]) 730 :type: Boolean
730 internalAlphaMatrix = lldNewDiversity 731 :param fRunRepresentative: Run Representative based methods (true indicates run).
731 #Get top ranked alpha diversity by most diverse 732 :type: Boolean
732 #Expects [[sample1,sample2,sample3...],[sample1,sample2,sample3..],...] 733 :param fRunExtreme: Run Extreme based methods (true indicates run).
733 #Returns [[sampleName1, sampleName2, sampleNameN],[sampleName1, sampleName2, sampleNameN]] 734 :type: Boolean
734 mostDiverseAlphaSamplesIndexes = self.funcGetTopRankedSamples(lldMatrix=internalAlphaMatrix, lsSampleNames=lsSampleNames, iTopAmount=iSampleSelectionCount) 735 :param istmBetaMatrix: File that has a precalculated beta matrix
735 736 :type: File stream or File path string
736 #Add to results 737 :return Selected Samples: Samples selected by methods.
737 for index in xrange(0,len(strMethod)): 738 Dictionary {"Selection Method":["SampleID","SampleID","SampleID",...]}
738 strSelectionMethod = self.dictConvertAMetricDiversity.get(strMethod[index],ConstantsMicropita.c_strDiversity+"="+strMethod[index]) 739 """
739 dictSelectedSamples.setdefault(strSelectionMethod,[]).extend(mostDiverseAlphaSamplesIndexes[index]) 740
740 741 # Sample ids/names
741 logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Selected Samples 1b") 742 lsSampleNames = abndData.funcGetSampleNames()
742 logging.info(dictSelectedSamples) 743
743 744 # Generate alpha metrics and get most diverse
744 #Generate beta metrics and 745 if fRunDiversity:
745 if fRunRepresentative or fRunExtreme: 746
746 747 # Get Alpha metrics matrix
747 #Abundance matrix transposed 748 internalAlphaMatrix = None
748 npaTransposedAbundance = UtilityMath.funcTransposeDataMatrix(abndData.funcGetAbundanceCopy(), fRemoveAdornments=True) 749 # Name of technique
749 750 strMethod = [
750 #Get center selection using clusters/tiling 751 strAlphaMetadata] if strAlphaMetadata else lsAlphaMetrics
751 #This will be for beta metrics in normalized space 752
752 if fRunRepresentative: 753 # If given an alpha-diversity metadata
753 754 if strAlphaMetadata:
754 if istmBetaMatrix: 755 internalAlphaMatrix = [
755 #Get representative dissimilarity samples 756 [float(strNum) for strNum in abndData.funcGetMetadata(strAlphaMetadata)]]
756 medoidSamples=self.funcGetCentralSamplesByKMedoids(npaMatrix=npaTransposedAbundance, sMetric=ConstantsMicropita.c_custom, lsSampleNames=lsSampleNames, iNumberSamplesReturned=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr) 757 else:
757 758 # Expects Observations (Taxa (row) x sample (column))
758 if medoidSamples: 759 #Returns [[metric1-sample1, metric1-sample2, metric1-sample3],[metric1-sample1, metric1-sample2, metric1-sample3]]
759 dictSelectedSamples.setdefault(ConstantsMicropita.c_strRepresentative+"="+ConstantsMicropita.c_custom,[]).extend(medoidSamples) 760 internalAlphaMatrix = Metric.funcBuildAlphaMetricsMatrix(npaSampleAbundance=abndData.funcGetAbundanceCopy()
760 else: 761 if not abndData.funcIsSummed()
761 logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Performing representative selection on normalized data.") 762 else abndData.funcGetFeatureAbundanceTable(abndData.funcGetTerminalNodes()).funcGetAbundanceCopy(),
762 for bMetric in lsBetaMetrics: 763 lsSampleNames=lsSampleNames, lsDiversityMetricAlpha=lsAlphaMetrics)
763 764
764 #Get representative dissimilarity samples 765 if internalAlphaMatrix:
765 medoidSamples=self.funcGetCentralSamplesByKMedoids(npaMatrix=npaTransposedAbundance, sMetric=bMetric, lsSampleNames=lsSampleNames, iNumberSamplesReturned=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr) 766 # Invert measurments
766 767 if fInvertDiversity:
767 if medoidSamples: 768 lldNewDiversity = []
768 dictSelectedSamples.setdefault(self.dictConvertBMetricToMethod.get(bMetric,ConstantsMicropita.c_strRepresentative+"="+bMetric),[]).extend(medoidSamples) 769 for lsLine in internalAlphaMatrix:
769 770 lldNewDiversity.append(
770 #Get extreme selection using clusters, tiling 771 [1/max(dValue, ConstantsMicropita.c_smallNumber) for dValue in lsLine])
771 if fRunExtreme: 772 internalAlphaMatrix = lldNewDiversity
772 logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Performing extreme selection on normalized data.") 773 # Get top ranked alpha diversity by most diverse
773 if istmBetaMatrix: 774 # Expects [[sample1,sample2,sample3...],[sample1,sample2,sample3..],...]
774 775 #Returns [[sampleName1, sampleName2, sampleNameN],[sampleName1, sampleName2, sampleNameN]]
775 #Samples for representative dissimilarity 776 mostDiverseAlphaSamplesIndexes = self.funcGetTopRankedSamples(
776 #This involves inverting the distance metric, 777 lldMatrix=internalAlphaMatrix, lsSampleNames=lsSampleNames, iTopAmount=iSampleSelectionCount)
777 #Taking the dendrogram level of where the number cluster == the number of samples to select 778
778 #Returning a repersentative sample from each cluster 779 # Add to results
779 extremeSamples = self.funcSelectExtremeSamplesFromHClust(strBetaMetric=ConstantsMicropita.c_custom, npaAbundanceMatrix=npaTransposedAbundance, lsSampleNames=lsSampleNames, iSelectSampleCount=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr) 780 for index in xrange(0, len(strMethod)):
780 781 strSelectionMethod = self.dictConvertAMetricDiversity.get(
781 #Add selected samples 782 strMethod[index], ConstantsMicropita.c_strDiversity+"="+strMethod[index])
782 if extremeSamples: 783 dictSelectedSamples.setdefault(strSelectionMethod, []).extend(
783 dictSelectedSamples.setdefault(ConstantsMicropita.c_strExtreme+"="+ConstantsMicropita.c_custom,[]).extend(extremeSamples) 784 mostDiverseAlphaSamplesIndexes[index])
784 785
785 else: 786 logging.info(
786 #Run KMedoids with inverse custom distance metric in normalized space 787 "MicroPITA.funcRunNormalizeSensitiveMethods:: Selected Samples 1b")
787 for bMetric in lsInverseBetaMetrics: 788 logging.info(dictSelectedSamples)
788 789
789 #Samples for representative dissimilarity 790 # Generate beta metrics and
790 #This involves inverting the distance metric, 791 if fRunRepresentative or fRunExtreme:
791 #Taking the dendrogram level of where the number cluster == the number of samples to select 792
792 #Returning a repersentative sample from each cluster 793 # Abundance matrix transposed
793 extremeSamples = self.funcSelectExtremeSamplesFromHClust(strBetaMetric=bMetric, npaAbundanceMatrix=npaTransposedAbundance, lsSampleNames=lsSampleNames, iSelectSampleCount=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr) 794 npaTransposedAbundance = UtilityMath.funcTransposeDataMatrix(
794 795 abndData.funcGetAbundanceCopy(), fRemoveAdornments=True)
795 #Add selected samples 796
796 if extremeSamples: 797 # Get center selection using clusters/tiling
797 dictSelectedSamples.setdefault(self.dictConvertInvBMetricToMethod.get(bMetric,ConstantsMicropita.c_strExtreme+"="+bMetric),[]).extend(extremeSamples) 798 # This will be for beta metrics in normalized space
798 799 if fRunRepresentative:
799 logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Selected Samples 2,3b") 800
800 logging.info(dictSelectedSamples) 801 if istmBetaMatrix:
801 return dictSelectedSamples 802 # Get representative dissimilarity samples
802 803 medoidSamples = self.funcGetCentralSamplesByKMedoids(npaMatrix=npaTransposedAbundance, sMetric=ConstantsMicropita.c_custom, lsSampleNames=lsSampleNames,
803 def funcRun(self, strIDName, strLastMetadataName, istmInput, 804 iNumberSamplesReturned=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)
804 ostmInputPredictFile, ostmPredictFile, ostmCheckedFile, ostmOutput, 805
805 cDelimiter, cFeatureNameDelimiter, strFeatureSelection, 806 if medoidSamples:
806 istmFeatures, iCount, lstrMethods, strLastRowMetadata = None, strLabel = None, strStratify = None, 807 dictSelectedSamples.setdefault(
807 strCustomAlpha = None, strCustomBeta = None, strAlphaMetadata = None, istmBetaMatrix = None, istrmTree = None, istrmEnvr = None, 808 ConstantsMicropita.c_strRepresentative+"="+ConstantsMicropita.c_custom, []).extend(medoidSamples)
808 iMinSeqs = ConstantsMicropita.c_liOccurenceFilter[0], iMinSamples = ConstantsMicropita.c_liOccurenceFilter[1], fInvertDiversity = False): 809 else:
809 """ 810 logging.info(
810 Manages the selection of samples given different metrics. 811 "MicroPITA.funcRunNormalizeSensitiveMethods:: Performing representative selection on normalized data.")
811 812 for bMetric in lsBetaMetrics:
812 :param strIDName: Sample Id metadata row 813
813 :type: String 814 # Get representative dissimilarity samples
814 :param strLastMetadataName: The id of the metadata positioned last in the abundance table. 815 medoidSamples = self.funcGetCentralSamplesByKMedoids(npaMatrix=npaTransposedAbundance, sMetric=bMetric, lsSampleNames=lsSampleNames,
815 :type: String String metadata id. 816 iNumberSamplesReturned=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)
816 :param istmInput: File to store input data to supervised methods. 817
817 :type: FileStream of String file path 818 if medoidSamples:
818 :param ostmInputPredictFile: File to store distances from supervised methods. 819 dictSelectedSamples.setdefault(self.dictConvertBMetricToMethod.get(
819 :type: FileStream or String file path 820 bMetric, ConstantsMicropita.c_strRepresentative+"="+bMetric), []).extend(medoidSamples)
820 :param ostmCheckedFile: File to store the AbundanceTable data after it is being checked. 821
821 :type: FileStream or String file path 822 # Get extreme selection using clusters, tiling
822 :param ostmOutPut: File to store sample selection by methods of interest. 823 if fRunExtreme:
823 :type: FileStream or String file path 824 logging.info(
824 :param cDelimiter: Delimiter of abundance table. 825 "MicroPITA.funcRunNormalizeSensitiveMethods:: Performing extreme selection on normalized data.")
825 :type: Character Char (default TAB). 826 if istmBetaMatrix:
826 :param cFeatureNameDelimiter: Delimiter of the name of features (for instance if they contain consensus lineages indicating clades). 827
827 :type: Character (default |). 828 # Samples for representative dissimilarity
828 :param stFeatureSelectionMethod: Which method to use to select features in a targeted manner (Using average ranked abundance or average abundance). 829 # This involves inverting the distance metric,
829 :type: String (specific values indicated in ConstantsMicropita.lsTargetedFeatureMethodValues). 830 # Taking the dendrogram level of where the number cluster == the number of samples to select
830 :param istmFeatures: File which holds the features of interest if using targeted feature methodology. 831 # Returning a repersentative sample from each cluster
831 :type: FileStream or String file path 832 extremeSamples = self.funcSelectExtremeSamplesFromHClust(strBetaMetric=ConstantsMicropita.c_custom, npaAbundanceMatrix=npaTransposedAbundance,
832 :param iCount: Number of samples to select in each methods, supervised methods select this amount per label if possible. 833 lsSampleNames=lsSampleNames, iSelectSampleCount=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)
833 :type: Integer integer. 834
834 :param lstrMethods: List of strings indicating selection techniques. 835 # Add selected samples
835 :type: List of string method names 836 if extremeSamples:
836 :param strLabel: The metadata used for supervised labels. 837 dictSelectedSamples.setdefault(
837 :type: String 838 ConstantsMicropita.c_strExtreme+"="+ConstantsMicropita.c_custom, []).extend(extremeSamples)
838 :param strStratify: The metadata used to stratify unsupervised data. 839
839 :type: String 840 else:
840 :param strCustomAlpha: Custom alpha diversity metric 841 # Run KMedoids with inverse custom distance metric in normalized space
841 :type: String 842 for bMetric in lsInverseBetaMetrics:
842 :param strCustomBeta: Custom beta diversity metric 843
843 :type: String 844 # Samples for representative dissimilarity
844 :param strAlphaMetadata: Metadata id which is a diveristy metric to use in highest diversity sampling 845 # This involves inverting the distance metric,
845 :type: String 846 # Taking the dendrogram level of where the number cluster == the number of samples to select
846 :param istmBetaMatrix: File containing precalculated beta-diversity matrix for representative sampling 847 # Returning a repersentative sample from each cluster
847 :type: FileStream or String file path 848 extremeSamples = self.funcSelectExtremeSamplesFromHClust(strBetaMetric=bMetric, npaAbundanceMatrix=npaTransposedAbundance, lsSampleNames=lsSampleNames,
848 :param istrmTree: File containing tree for phylogentic beta-diversity analysis 849 iSelectSampleCount=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)
849 :type: FileStream or String file path 850
850 :param istrmEnvr: File containing environment for phylogentic beta-diversity analysis 851 # Add selected samples
851 :type: FileStream or String file path 852 if extremeSamples:
852 :param iMinSeqs: Minimum sequence in the occurence filter which filters all features not with a minimum number of sequences in each of a minimum number of samples. 853 dictSelectedSamples.setdefault(self.dictConvertInvBMetricToMethod.get(
853 :type: Integer 854 bMetric, ConstantsMicropita.c_strExtreme+"="+bMetric), []).extend(extremeSamples)
854 :param iMinSamples: Minimum sample count for the occurence filter. 855
855 :type: Integer 856 logging.info(
856 :param fInvertDiversity: When true will invert diversity measurements before using. 857 "MicroPITA.funcRunNormalizeSensitiveMethods:: Selected Samples 2,3b")
857 :type: boolean 858 logging.info(dictSelectedSamples)
858 :return Selected Samples: Samples selected by methods. 859 return dictSelectedSamples
859 Dictionary {"Selection Method":["SampleID","SampleID","SampleID",...]} 860
860 """ 861 def funcRun(self, strIDName, strLastMetadataName, istmInput,
861 862 ostmInputPredictFile, ostmPredictFile, ostmCheckedFile, ostmOutput,
862 #Holds the top ranked samples from different metrics 863 cDelimiter, cFeatureNameDelimiter, strFeatureSelection,
863 #dict[metric name] = [samplename,samplename...] 864 istmFeatures, iCount, lstrMethods, strLastRowMetadata=None, strLabel=None, strStratify=None,
864 selectedSamples = dict() 865 strCustomAlpha=None, strCustomBeta=None, strAlphaMetadata=None, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None,
865 866 iMinSeqs=ConstantsMicropita.c_liOccurenceFilter[0], iMinSamples=ConstantsMicropita.c_liOccurenceFilter[1], fInvertDiversity=False):
866 #If a target feature file is given make sure that targeted feature is in the selection methods, if not add 867 """
867 if ConstantsMicropita.c_strFeature in lstrMethods: 868 Manages the selection of samples given different metrics.
868 if not istmFeatures: 869
869 logging.error("MicroPITA.funcRun:: Did not receive both the Targeted feature file and the feature selection method. MicroPITA did not run.") 870 :param strIDName: Sample Id metadata row
870 return False 871 :type: String
871 872 :param strLastMetadataName: The id of the metadata positioned last in the abundance table.
872 #Diversity metrics to run 873 :type: String String metadata id.
873 #Use custom metrics if specified 874 :param istmInput: File to store input data to supervised methods.
874 #Custom beta metrics set to normalized only, custom alpha metrics set to count only 875 :type: FileStream of String file path
875 diversityMetricsAlpha = [] if strCustomAlpha or strAlphaMetadata else [MicroPITA.c_strInverseSimpsonDiversity] 876 :param ostmInputPredictFile: File to store distances from supervised methods.
876 diversityMetricsBeta = [] if istmBetaMatrix else [strCustomBeta] if strCustomBeta else [MicroPITA.c_strBrayCurtisDissimilarity] 877 :type: FileStream or String file path
878 :param ostmCheckedFile: File to store the AbundanceTable data after it is being checked.
879 :type: FileStream or String file path
880 :param ostmOutPut: File to store sample selection by methods of interest.
881 :type: FileStream or String file path
882 :param cDelimiter: Delimiter of abundance table.
883 :type: Character Char (default TAB).
884 :param cFeatureNameDelimiter: Delimiter of the name of features (for instance if they contain consensus lineages indicating clades).
885 :type: Character (default |).
886 :param stFeatureSelectionMethod: Which method to use to select features in a targeted manner (Using average ranked abundance or average abundance).
887 :type: String (specific values indicated in ConstantsMicropita.lsTargetedFeatureMethodValues).
888 :param istmFeatures: File which holds the features of interest if using targeted feature methodology.
889 :type: FileStream or String file path
890 :param iCount: Number of samples to select in each methods, supervised methods select this amount per label if possible.
891 :type: Integer integer.
892 :param lstrMethods: List of strings indicating selection techniques.
893 :type: List of string method names
894 :param strLabel: The metadata used for supervised labels.
895 :type: String
896 :param strStratify: The metadata used to stratify unsupervised data.
897 :type: String
898 :param strCustomAlpha: Custom alpha diversity metric
899 :type: String
900 :param strCustomBeta: Custom beta diversity metric
901 :type: String
902 :param strAlphaMetadata: Metadata id which is a diveristy metric to use in highest diversity sampling
903 :type: String
904 :param istmBetaMatrix: File containing precalculated beta-diversity matrix for representative sampling
905 :type: FileStream or String file path
906 :param istrmTree: File containing tree for phylogentic beta-diversity analysis
907 :type: FileStream or String file path
908 :param istrmEnvr: File containing environment for phylogentic beta-diversity analysis
909 :type: FileStream or String file path
910 :param iMinSeqs: Minimum sequence in the occurence filter which filters all features not with a minimum number of sequences in each of a minimum number of samples.
911 :type: Integer
912 :param iMinSamples: Minimum sample count for the occurence filter.
913 :type: Integer
914 :param fInvertDiversity: When true will invert diversity measurements before using.
915 :type: boolean
916 :return Selected Samples: Samples selected by methods.
917 Dictionary {"Selection Method":["SampleID","SampleID","SampleID",...]}
918 """
919
920 # Holds the top ranked samples from different metrics
921 # dict[metric name] = [samplename,samplename...]
922 selectedSamples = dict()
923
924 # If a target feature file is given make sure that targeted feature is in the selection methods, if not add
925 if ConstantsMicropita.c_strFeature in lstrMethods:
926 if not istmFeatures:
927 logging.error(
928 "MicroPITA.funcRun:: Did not receive both the Targeted feature file and the feature selection method. MicroPITA did not run.")
929 return False
930
931 # Diversity metrics to run
932 # Use custom metrics if specified
933 # Custom beta metrics set to normalized only, custom alpha metrics set to count only
934 diversityMetricsAlpha = [] if strCustomAlpha or strAlphaMetadata else [
935 MicroPITA.c_strInverseSimpsonDiversity]
936 diversityMetricsBeta = [] if istmBetaMatrix else [
937 strCustomBeta] if strCustomBeta else [MicroPITA.c_strBrayCurtisDissimilarity]
877 # inverseDiversityMetricsBeta = [MicroPITA.c_strInvBrayCurtisDissimilarity] 938 # inverseDiversityMetricsBeta = [MicroPITA.c_strInvBrayCurtisDissimilarity]
878 diversityMetricsAlphaNoNormalize = [strAlphaMetadata] if strAlphaMetadata else [strCustomAlpha] if strCustomAlpha else [] 939 diversityMetricsAlphaNoNormalize = [strAlphaMetadata] if strAlphaMetadata else [
879 diversityMetricsBetaNoNormalize = [] 940 strCustomAlpha] if strCustomAlpha else []
941 diversityMetricsBetaNoNormalize = []
880 # inverseDiversityMetricsBetaNoNormalize = [] 942 # inverseDiversityMetricsBetaNoNormalize = []
881 943
882 #Targeted taxa 944 # Targeted taxa
883 userDefinedTaxa = [] 945 userDefinedTaxa = []
884 946
885 #Perform different flows flags 947 # Perform different flows flags
886 c_RUN_MAX_DIVERSITY_1 = ConstantsMicropita.c_strDiversity in lstrMethods 948 c_RUN_MAX_DIVERSITY_1 = ConstantsMicropita.c_strDiversity in lstrMethods
887 c_RUN_REPRESENTIVE_DISSIMILARITY_2 = ConstantsMicropita.c_strRepresentative in lstrMethods 949 c_RUN_REPRESENTIVE_DISSIMILARITY_2 = ConstantsMicropita.c_strRepresentative in lstrMethods
888 c_RUN_MAX_DISSIMILARITY_3 = ConstantsMicropita.c_strExtreme in lstrMethods 950 c_RUN_MAX_DISSIMILARITY_3 = ConstantsMicropita.c_strExtreme in lstrMethods
889 c_RUN_RANK_AVERAGE_USER_4 = False 951 c_RUN_RANK_AVERAGE_USER_4 = False
890 if ConstantsMicropita.c_strFeature in lstrMethods: 952 if ConstantsMicropita.c_strFeature in lstrMethods:
891 c_RUN_RANK_AVERAGE_USER_4 = True 953 c_RUN_RANK_AVERAGE_USER_4 = True
892 if not istmFeatures: 954 if not istmFeatures:
893 logging.error("MicroPITA.funcRun:: No taxa file was given for taxa selection.") 955 logging.error(
894 return False 956 "MicroPITA.funcRun:: No taxa file was given for taxa selection.")
895 #Read in taxa list, break down to lines and filter out empty strings 957 return False
896 userDefinedTaxa = filter(None,(s.strip( ) for s in istmFeatures.readlines())) 958 # Read in taxa list, break down to lines and filter out empty strings
897 c_RUN_RANDOM_5 = ConstantsMicropita.c_strRandom in lstrMethods 959 userDefinedTaxa = filter(None, (s.strip()
898 c_RUN_DISTINCT = ConstantsMicropita.c_strDistinct in lstrMethods 960 for s in istmFeatures.readlines()))
899 c_RUN_DISCRIMINANT = ConstantsMicropita.c_strDiscriminant in lstrMethods 961 c_RUN_RANDOM_5 = ConstantsMicropita.c_strRandom in lstrMethods
900 962 c_RUN_DISTINCT = ConstantsMicropita.c_strDistinct in lstrMethods
901 #Read in abundance data 963 c_RUN_DISCRIMINANT = ConstantsMicropita.c_strDiscriminant in lstrMethods
902 #Abundance is a structured array. Samples (column) by Taxa (rows) with the taxa id row included as the column index=0 964
903 #Abundance table object to read in and manage data 965 # Read in abundance data
904 totalAbundanceTable = AbundanceTable.funcMakeFromFile(xInputFile=istmInput, lOccurenceFilter = [iMinSeqs, iMinSamples], 966 # Abundance is a structured array. Samples (column) by Taxa (rows) with the taxa id row included as the column index=0
905 cDelimiter=cDelimiter, sMetadataID=strIDName, sLastMetadataRow=strLastRowMetadata, 967 # Abundance table object to read in and manage data
906 sLastMetadata=strLastMetadataName, cFeatureNameDelimiter=cFeatureNameDelimiter, xOutputFile=ostmCheckedFile) 968 totalAbundanceTable = AbundanceTable.funcMakeFromFile(xInputFile=istmInput, lOccurenceFilter=[iMinSeqs, iMinSamples],
907 if not totalAbundanceTable: 969 cDelimiter=cDelimiter, sMetadataID=strIDName, sLastMetadataRow=strLastRowMetadata,
908 logging.error("MicroPITA.funcRun:: Could not read in the abundance table. Analysis was not performed."+ 970 sLastMetadata=strLastMetadataName, cFeatureNameDelimiter=cFeatureNameDelimiter, xOutputFile=ostmCheckedFile)
909 " This often occurs when the Last Metadata is not specified correctly."+ 971 if not totalAbundanceTable:
910 " Please check to make sure the Last Metadata selection is the row of the last metadata,"+ 972 logging.error("MicroPITA.funcRun:: Could not read in the abundance table. Analysis was not performed." +
911 " all values after this selection should be microbial measurements and should be numeric.") 973 " This often occurs when the Last Metadata is not specified correctly." +
912 return False 974 " Please check to make sure the Last Metadata selection is the row of the last metadata," +
913 975 " all values after this selection should be microbial measurements and should be numeric.")
914 lsOriginalLabels = SVM.funcMakeLabels(totalAbundanceTable.funcGetMetadata(strLabel)) if strLabel else strLabel 976 return False
915 977
916 dictTotalMetadata = totalAbundanceTable.funcGetMetadataCopy() 978 lsOriginalLabels = SVM.funcMakeLabels(
917 logging.debug("MicroPITA.funcRun:: Received metadata=" + str(dictTotalMetadata)) 979 totalAbundanceTable.funcGetMetadata(strLabel)) if strLabel else strLabel
918 #If there is only 1 unique value for the labels, do not run the Supervised methods 980
919 if strLabel and ( len(set(dictTotalMetadata.get(strLabel,[]))) < 2 ): 981 dictTotalMetadata = totalAbundanceTable.funcGetMetadataCopy()
920 logging.error("The label " + strLabel + " did not have 2 or more values. Labels found=" + str(dictTotalMetadata.get(strLabel,[]))) 982 logging.debug("MicroPITA.funcRun:: Received metadata=" +
921 return False 983 str(dictTotalMetadata))
922 984 # If there is only 1 unique value for the labels, do not run the Supervised methods
923 #Run unsupervised methods### 985 if strLabel and (len(set(dictTotalMetadata.get(strLabel, []))) < 2):
924 #Stratify the data if need be and drop the old data 986 logging.error("The label " + strLabel + " did not have 2 or more values. Labels found=" +
925 lStratifiedAbundanceTables = totalAbundanceTable.funcStratifyByMetadata(strStratify) if strStratify else [totalAbundanceTable] 987 str(dictTotalMetadata.get(strLabel, [])))
926 988 return False
927 #For each stratified abundance block or for the unstratfified abundance 989
928 #Run the unsupervised blocks 990 #Run unsupervised methods###
929 fAppendSupFiles = False 991 # Stratify the data if need be and drop the old data
930 for stratAbundanceTable in lStratifiedAbundanceTables: 992 lStratifiedAbundanceTables = totalAbundanceTable.funcStratifyByMetadata(
931 logging.info("MicroPITA.funcRun:: Running abundance block:"+stratAbundanceTable.funcGetName()) 993 strStratify) if strStratify else [totalAbundanceTable]
932 994
933 ###NOT SUMMED, NOT NORMALIZED 995 # For each stratified abundance block or for the unstratfified abundance
934 #Only perform if the data is not yet normalized 996 # Run the unsupervised blocks
935 if not stratAbundanceTable.funcIsNormalized( ): 997 fAppendSupFiles = False
936 #Need to first work with unnormalized data 998 for stratAbundanceTable in lStratifiedAbundanceTables:
937 if c_RUN_MAX_DIVERSITY_1 or c_RUN_REPRESENTIVE_DISSIMILARITY_2 or c_RUN_MAX_DISSIMILARITY_3: 999 logging.info("MicroPITA.funcRun:: Running abundance block:" +
938 1000 stratAbundanceTable.funcGetName())
939 self._funcRunNormalizeSensitiveMethods(abndData=stratAbundanceTable, iSampleSelectionCount=iCount, 1001
940 dictSelectedSamples=selectedSamples, lsAlphaMetrics=diversityMetricsAlphaNoNormalize, 1002 # NOT SUMMED, NOT NORMALIZED
941 lsBetaMetrics=diversityMetricsBetaNoNormalize, 1003 # Only perform if the data is not yet normalized
942 lsInverseBetaMetrics=diversityMetricsBetaNoNormalize, 1004 if not stratAbundanceTable.funcIsNormalized():
943 fRunDiversity=c_RUN_MAX_DIVERSITY_1,fRunRepresentative=c_RUN_REPRESENTIVE_DISSIMILARITY_2, 1005 # Need to first work with unnormalized data
944 fRunExtreme=c_RUN_MAX_DISSIMILARITY_3, strAlphaMetadata=strAlphaMetadata, 1006 if c_RUN_MAX_DIVERSITY_1 or c_RUN_REPRESENTIVE_DISSIMILARITY_2 or c_RUN_MAX_DISSIMILARITY_3:
945 istrmTree=istrmTree, istrmEnvr=istrmEnvr, fInvertDiversity=fInvertDiversity) 1007
946 1008 self._funcRunNormalizeSensitiveMethods(abndData=stratAbundanceTable, iSampleSelectionCount=iCount,
947 1009 dictSelectedSamples=selectedSamples, lsAlphaMetrics=diversityMetricsAlphaNoNormalize,
948 #Generate selection by the rank average of user defined taxa 1010 lsBetaMetrics=diversityMetricsBetaNoNormalize,
949 #Expects (Taxa (row) by Samples (column)) 1011 lsInverseBetaMetrics=diversityMetricsBetaNoNormalize,
950 #Expects a column 0 of taxa id that is skipped 1012 fRunDiversity=c_RUN_MAX_DIVERSITY_1, fRunRepresentative=c_RUN_REPRESENTIVE_DISSIMILARITY_2,
951 #Returns [(sample name,average,rank)] 1013 fRunExtreme=c_RUN_MAX_DISSIMILARITY_3, strAlphaMetadata=strAlphaMetadata,
952 #SUMMED AND NORMALIZED 1014 istrmTree=istrmTree, istrmEnvr=istrmEnvr, fInvertDiversity=fInvertDiversity)
953 stratAbundanceTable.funcSumClades() 1015
954 #Normalize data at this point 1016 # Generate selection by the rank average of user defined taxa
955 stratAbundanceTable.funcNormalize() 1017 # Expects (Taxa (row) by Samples (column))
956 if c_RUN_RANK_AVERAGE_USER_4: 1018 # Expects a column 0 of taxa id that is skipped
957 selectedSamples[ConstantsMicropita.c_strFeature] = self.funcSelectTargetedTaxaSamples(abndMatrix=stratAbundanceTable, 1019 # Returns [(sample name,average,rank)]
958 lsTargetedTaxa=userDefinedTaxa, iSampleSelectionCount=iCount, sMethod=strFeatureSelection) 1020 # SUMMED AND NORMALIZED
959 logging.info("MicroPITA.funcRun:: Selected Samples Rank") 1021 stratAbundanceTable.funcSumClades()
960 logging.info(selectedSamples) 1022 # Normalize data at this point
961 1023 stratAbundanceTable.funcNormalize()
962 ###SUMMED AND NORMALIZED analysis block 1024 if c_RUN_RANK_AVERAGE_USER_4:
963 #Diversity based metric will move reduce to terminal taxa as needed 1025 selectedSamples[ConstantsMicropita.c_strFeature] = self.funcSelectTargetedTaxaSamples(abndMatrix=stratAbundanceTable,
964 if c_RUN_MAX_DIVERSITY_1 or c_RUN_REPRESENTIVE_DISSIMILARITY_2 or c_RUN_MAX_DISSIMILARITY_3: 1026 lsTargetedTaxa=userDefinedTaxa, iSampleSelectionCount=iCount, sMethod=strFeatureSelection)
965 1027 logging.info("MicroPITA.funcRun:: Selected Samples Rank")
966 self._funcRunNormalizeSensitiveMethods(abndData=stratAbundanceTable, iSampleSelectionCount=iCount, 1028 logging.info(selectedSamples)
967 dictSelectedSamples=selectedSamples, lsAlphaMetrics=diversityMetricsAlpha, 1029
968 lsBetaMetrics=diversityMetricsBeta, 1030 # SUMMED AND NORMALIZED analysis block
969 lsInverseBetaMetrics=diversityMetricsBeta, 1031 # Diversity based metric will move reduce to terminal taxa as needed
970 fRunDiversity=c_RUN_MAX_DIVERSITY_1,fRunRepresentative=c_RUN_REPRESENTIVE_DISSIMILARITY_2, 1032 if c_RUN_MAX_DIVERSITY_1 or c_RUN_REPRESENTIVE_DISSIMILARITY_2 or c_RUN_MAX_DISSIMILARITY_3:
971 fRunExtreme=c_RUN_MAX_DISSIMILARITY_3, 1033
972 istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr, fInvertDiversity=fInvertDiversity) 1034 self._funcRunNormalizeSensitiveMethods(abndData=stratAbundanceTable, iSampleSelectionCount=iCount,
973 1035 dictSelectedSamples=selectedSamples, lsAlphaMetrics=diversityMetricsAlpha,
974 #5::Select randomly 1036 lsBetaMetrics=diversityMetricsBeta,
975 #Expects sampleNames = List of sample names [name, name, name...] 1037 lsInverseBetaMetrics=diversityMetricsBeta,
976 if(c_RUN_RANDOM_5): 1038 fRunDiversity=c_RUN_MAX_DIVERSITY_1, fRunRepresentative=c_RUN_REPRESENTIVE_DISSIMILARITY_2,
977 #Select randomly from sample names 1039 fRunExtreme=c_RUN_MAX_DISSIMILARITY_3,
978 selectedSamples[ConstantsMicropita.c_strRandom] = self.funcGetRandomSamples(lsSamples=stratAbundanceTable.funcGetSampleNames(), iNumberOfSamplesToReturn=iCount) 1040 istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr, fInvertDiversity=fInvertDiversity)
979 logging.info("MicroPITA.funcRun:: Selected Samples Random") 1041
980 logging.info(selectedSamples) 1042 # 5::Select randomly
981 1043 # Expects sampleNames = List of sample names [name, name, name...]
982 #Perform supervised selection 1044 if(c_RUN_RANDOM_5):
983 if c_RUN_DISTINCT or c_RUN_DISCRIMINANT: 1045 # Select randomly from sample names
984 if strLabel: 1046 selectedSamples[ConstantsMicropita.c_strRandom] = self.funcGetRandomSamples(
985 dictSelectionRet = self.funcRunSupervisedDistancesFromCentroids(abundanceTable=stratAbundanceTable, 1047 lsSamples=stratAbundanceTable.funcGetSampleNames(), iNumberOfSamplesToReturn=iCount)
986 fRunDistinct=c_RUN_DISTINCT, fRunDiscriminant=c_RUN_DISCRIMINANT, 1048 logging.info("MicroPITA.funcRun:: Selected Samples Random")
987 xOutputSupFile=ostmInputPredictFile,xPredictSupFile=ostmPredictFile, 1049 logging.info(selectedSamples)
988 strSupervisedMetadata=strLabel, iSampleSupSelectionCount=iCount, 1050
989 lsOriginalSampleNames = totalAbundanceTable.funcGetSampleNames(), 1051 # Perform supervised selection
990 lsOriginalLabels = lsOriginalLabels, 1052 if c_RUN_DISTINCT or c_RUN_DISCRIMINANT:
991 fAppendFiles=fAppendSupFiles) 1053 if strLabel:
992 1054 dictSelectionRet = self.funcRunSupervisedDistancesFromCentroids(abundanceTable=stratAbundanceTable,
993 [selectedSamples.setdefault(sKey,[]).extend(lValue) for sKey,lValue in dictSelectionRet.items()] 1055 fRunDistinct=c_RUN_DISTINCT, fRunDiscriminant=c_RUN_DISCRIMINANT,
994 1056 xOutputSupFile=ostmInputPredictFile, xPredictSupFile=ostmPredictFile,
995 if not fAppendSupFiles: 1057 strSupervisedMetadata=strLabel, iSampleSupSelectionCount=iCount,
996 fAppendSupFiles = True 1058 lsOriginalSampleNames=totalAbundanceTable.funcGetSampleNames(),
997 logging.info("MicroPITA.funcRun:: Selected Samples Unsupervised") 1059 lsOriginalLabels=lsOriginalLabels,
998 logging.info(selectedSamples) 1060 fAppendFiles=fAppendSupFiles)
999 return selectedSamples 1061
1000 1062 [selectedSamples.setdefault(sKey, []).extend(
1001 #Testing: Happy path tested 1063 lValue) for sKey, lValue in dictSelectionRet.items()]
1002 @staticmethod 1064
1003 def funcWriteSelectionToFile(dictSelection,xOutputFilePath): 1065 if not fAppendSupFiles:
1004 """ 1066 fAppendSupFiles = True
1005 Writes the selection of samples by method to an output file. 1067 logging.info(
1006 1068 "MicroPITA.funcRun:: Selected Samples Unsupervised")
1007 :param dictSelection: The dictionary of selections by method to be written to a file. 1069 logging.info(selectedSamples)
1008 :type: Dictionary The dictionary of selections by method {"method":["sample selected","sample selected"...]} 1070 return selectedSamples
1009 :param xOutputFilePath: FileStream or String path to file inwhich the dictionary is written. 1071
1010 :type: String FileStream or String path to file 1072 # Testing: Happy path tested
1011 """ 1073 @staticmethod
1012 1074 def funcWriteSelectionToFile(dictSelection, xOutputFilePath):
1013 if not dictSelection: 1075 """
1014 return 1076 Writes the selection of samples by method to an output file.
1015 1077
1016 #Open file 1078 :param dictSelection: The dictionary of selections by method to be written to a file.
1017 f = csv.writer(open(xOutputFilePath,"w") if isinstance(xOutputFilePath, str) else xOutputFilePath, delimiter=ConstantsMicropita.c_outputFileDelim ) 1079 :type: Dictionary The dictionary of selections by method {"method":["sample selected","sample selected"...]}
1018 1080 :param xOutputFilePath: FileStream or String path to file inwhich the dictionary is written.
1019 #Create output content from dictionary 1081 :type: String FileStream or String path to file
1020 for sKey in dictSelection: 1082 """
1021 f.writerow([sKey]+dictSelection[sKey]) 1083
1022 logging.debug("MicroPITA.funcRun:: Selected samples output to file:"+str(dictSelection[sKey])) 1084 if not dictSelection:
1023 1085 return
1024 #Testing: Happy Path tested 1086
1025 @staticmethod 1087 # Open file
1026 def funcReadSelectionFileToDictionary(xInputFile): 1088 f = csv.writer(open(xOutputFilePath, "w") if isinstance(
1027 """ 1089 xOutputFilePath, str) else xOutputFilePath, delimiter=ConstantsMicropita.c_outputFileDelim)
1028 Reads in an output selection file from micropita and formats it into a dictionary. 1090
1029 1091 # Create output content from dictionary
1030 :param xInputFile: String path to file or file stream to read and translate into a dictionary. 1092 for sKey in dictSelection:
1031 {"method":["sample selected","sample selected"...]} 1093 f.writerow([sKey]+dictSelection[sKey])
1032 :type: FileStream or String Path to file 1094 logging.debug(
1033 :return Dictionary: Samples selected by methods. 1095 "MicroPITA.funcRun:: Selected samples output to file:"+str(dictSelection[sKey]))
1034 Dictionary {"Selection Method":["SampleID","SampleID","SampleID",...]} 1096
1035 """ 1097 # Testing: Happy Path tested
1036 1098 @staticmethod
1037 #Open file 1099 def funcReadSelectionFileToDictionary(xInputFile):
1038 istmReader = csv.reader(open(xInputFile,'r') if isinstance(xInputFile, str) else xInputFile, delimiter = ConstantsMicropita.c_outputFileDelim) 1100 """
1039 1101 Reads in an output selection file from micropita and formats it into a dictionary.
1040 #Dictionary to hold selection data 1102
1041 return dict([(lsLine[0], lsLine[1:]) for lsLine in istmReader]) 1103 :param xInputFile: String path to file or file stream to read and translate into a dictionary.
1042 1104 {"method":["sample selected","sample selected"...]}
1043 #Set up arguments reader 1105 :type: FileStream or String Path to file
1044 argp = argparse.ArgumentParser( prog = "MicroPITA.py", 1106 :return Dictionary: Samples selected by methods.
1045 description = """Selects samples from abundance tables based on various selection schemes.""" ) 1107 Dictionary {"Selection Method":["SampleID","SampleID","SampleID",...]}
1046 1108 """
1047 args = argp.add_argument_group( "Common", "Commonly modified options" ) 1109
1048 args.add_argument(ConstantsMicropita.c_strCountArgument,"--num", dest="iCount", metavar = "samples", default = 10, type = int, help = ConstantsMicropita.c_strCountHelp) 1110 # Open file
1049 args.add_argument("-m","--method", dest = "lstrMethods", metavar = "method", default = [], help = ConstantsMicropita.c_strSelectionTechniquesHelp, 1111 istmReader = csv.reader(open(xInputFile, 'r') if isinstance(
1050 choices = ConstantsMicropita.c_lsAllMethods, action = "append") 1112 xInputFile, str) else xInputFile, delimiter=ConstantsMicropita.c_outputFileDelim)
1051 1113
1052 args = argp.add_argument_group( "Custom", "Selecting and inputing custom metrics" ) 1114 # Dictionary to hold selection data
1053 args.add_argument("-a","--alpha", dest = "strAlphaDiversity", metavar = "AlphaDiversity", default = None, help = ConstantsMicropita.c_strCustomAlphaDiversityHelp, choices = Metric.setAlphaDiversities) 1115 return dict([(lsLine[0], lsLine[1:]) for lsLine in istmReader])
1054 args.add_argument("-b","--beta", dest = "strBetaDiversity", metavar = "BetaDiversity", default = None, help = ConstantsMicropita.c_strCustomBetaDiversityHelp, choices = list(Metric.setBetaDiversities)+[Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted]) 1116
1055 args.add_argument("-q","--alphameta", dest = "strAlphaMetadata", metavar = "AlphaDiversityMetadata", default = None, help = ConstantsMicropita.c_strCustomAlphaDiversityMetadataHelp) 1117
1056 args.add_argument("-x","--betamatrix", dest = "istmBetaMatrix", metavar = "BetaDiversityMatrix", default = None, help = ConstantsMicropita.c_strCustomBetaDiversityMatrixHelp) 1118 # Set up arguments reader
1057 args.add_argument("-o","--tree", dest = "istrmTree", metavar = "PhylogeneticTree", default = None, help = ConstantsMicropita.c_strCustomPhylogeneticTreeHelp) 1119 argp = argparse.ArgumentParser(prog="MicroPITA.py",
1058 args.add_argument("-i","--envr", dest = "istrmEnvr", metavar = "EnvironmentFile", default = None, help = ConstantsMicropita.c_strCustomEnvironmentFileHelp) 1120 description="""Selects samples from abundance tables based on various selection schemes.""")
1059 args.add_argument("-f","--invertDiversity", dest = "fInvertDiversity", action="store_true", default = False, help = ConstantsMicropita.c_strInvertDiversityHelp) 1121
1060 1122 args = argp.add_argument_group("Common", "Commonly modified options")
1061 args = argp.add_argument_group( "Miscellaneous", "Row/column identifiers and feature targeting options" ) 1123 args.add_argument(ConstantsMicropita.c_strCountArgument, "--num", dest="iCount",
1062 args.add_argument("-d",ConstantsMicropita.c_strIDNameArgument, dest="strIDName", metavar="sample_id", help= ConstantsMicropita.c_strIDNameHelp) 1124 metavar="samples", default=10, type=int, help=ConstantsMicropita.c_strCountHelp)
1063 args.add_argument("-l",ConstantsMicropita.c_strLastMetadataNameArgument, dest="strLastMetadataName", metavar = "metadata_id", default = None, 1125 args.add_argument("-m", "--method", dest="lstrMethods", metavar="method", default=[], help=ConstantsMicropita.c_strSelectionTechniquesHelp,
1064 help= ConstantsMicropita.c_strLastMetadataNameHelp) 1126 choices=ConstantsMicropita.c_lsAllMethods, action="append")
1065 args.add_argument("-r",ConstantsMicropita.c_strTargetedFeatureMethodArgument, dest="strFeatureSelection", metavar="targeting_method", default=ConstantsMicropita.lsTargetedFeatureMethodValues[0], 1127
1066 choices=ConstantsMicropita.lsTargetedFeatureMethodValues, help= ConstantsMicropita.c_strTargetedFeatureMethodHelp) 1128 args = argp.add_argument_group(
1067 args.add_argument("-t",ConstantsMicropita.c_strTargetedSelectionFileArgument, dest="istmFeatures", metavar="feature_file", type=argparse.FileType("rU"), help=ConstantsMicropita.c_strTargetedSelectionFileHelp) 1129 "Custom", "Selecting and inputing custom metrics")
1068 args.add_argument("-w",ConstantsMicropita.c_strFeatureMetadataArgument, dest="strLastFeatureMetadata", metavar="Last_Feature_Metadata", default=None, help=ConstantsMicropita.c_strFeatureMetadataHelp) 1130 args.add_argument("-a", "--alpha", dest="strAlphaDiversity", metavar="AlphaDiversity", default=None,
1069 1131 help=ConstantsMicropita.c_strCustomAlphaDiversityHelp, choices=Metric.setAlphaDiversities)
1070 args = argp.add_argument_group( "Data labeling", "Metadata IDs for strata and supervised label values" ) 1132 args.add_argument("-b", "--beta", dest="strBetaDiversity", metavar="BetaDiversity", default=None, help=ConstantsMicropita.c_strCustomBetaDiversityHelp,
1071 args.add_argument("-e",ConstantsMicropita.c_strSupervisedLabelArgument, dest="strLabel", metavar= "supervised_id", help=ConstantsMicropita.c_strSupervisedLabelHelp) 1133 choices=list(Metric.setBetaDiversities)+[Metric.c_strUnifracUnweighted, Metric.c_strUnifracWeighted])
1072 args.add_argument("-s",ConstantsMicropita.c_strUnsupervisedStratifyMetadataArgument, dest="strUnsupervisedStratify", metavar="stratify_id", 1134 args.add_argument("-q", "--alphameta", dest="strAlphaMetadata", metavar="AlphaDiversityMetadata",
1073 help= ConstantsMicropita.c_strUnsupervisedStratifyMetadataHelp) 1135 default=None, help=ConstantsMicropita.c_strCustomAlphaDiversityMetadataHelp)
1074 1136 args.add_argument("-x", "--betamatrix", dest="istmBetaMatrix", metavar="BetaDiversityMatrix",
1075 args = argp.add_argument_group( "File formatting", "Rarely modified file formatting options" ) 1137 default=None, help=ConstantsMicropita.c_strCustomBetaDiversityMatrixHelp)
1076 args.add_argument("-j",ConstantsMicropita.c_strFileDelimiterArgument, dest="cFileDelimiter", metavar="column_delimiter", default="\t", help=ConstantsMicropita.c_strFileDelimiterHelp) 1138 args.add_argument("-o", "--tree", dest="istrmTree", metavar="PhylogeneticTree",
1077 args.add_argument("-k",ConstantsMicropita.c_strFeatureNameDelimiterArgument, dest="cFeatureNameDelimiter", metavar="taxonomy_delimiter", default="|", help=ConstantsMicropita.c_strFeatureNameDelimiterHelp) 1139 default=None, help=ConstantsMicropita.c_strCustomPhylogeneticTreeHelp)
1078 1140 args.add_argument("-i", "--envr", dest="istrmEnvr", metavar="EnvironmentFile",
1079 args = argp.add_argument_group( "Debugging", "Debugging options - modify at your own risk!" ) 1141 default=None, help=ConstantsMicropita.c_strCustomEnvironmentFileHelp)
1080 args.add_argument("-v",ConstantsMicropita.c_strLoggingArgument, dest="strLogLevel", metavar = "log_level", default="WARNING", 1142 args.add_argument("-f", "--invertDiversity", dest="fInvertDiversity", action="store_true",
1081 choices=ConstantsMicropita.c_lsLoggingChoices, help= ConstantsMicropita.c_strLoggingHelp) 1143 default=False, help=ConstantsMicropita.c_strInvertDiversityHelp)
1082 args.add_argument("-c",ConstantsMicropita.c_strCheckedAbundanceFileArgument, dest="ostmCheckedFile", metavar = "output_qc", type = argparse.FileType("w"), help = ConstantsMicropita.c_strCheckedAbundanceFileHelp) 1144
1083 args.add_argument("-g",ConstantsMicropita.c_strLoggingFileArgument, dest="ostmLoggingFile", metavar = "output_log", type = argparse.FileType("w"), help = ConstantsMicropita.c_strLoggingFileHelp) 1145 args = argp.add_argument_group(
1084 args.add_argument("-u",ConstantsMicropita.c_strSupervisedInputFile, dest="ostmInputPredictFile", metavar = "output_scaled", type = argparse.FileType("w"), help = ConstantsMicropita.c_strSupervisedInputFileHelp) 1146 "Miscellaneous", "Row/column identifiers and feature targeting options")
1085 args.add_argument("-p",ConstantsMicropita.c_strSupervisedPredictedFile, dest="ostmPredictFile", metavar = "output_labels", type = argparse.FileType("w"), help = ConstantsMicropita.c_strSupervisedPredictedFileHelp) 1147 args.add_argument("-d", ConstantsMicropita.c_strIDNameArgument, dest="strIDName",
1086 1148 metavar="sample_id", help=ConstantsMicropita.c_strIDNameHelp)
1087 argp.add_argument("istmInput", metavar = "input.pcl/biome", type = argparse.FileType("rU"), help = ConstantsMicropita.c_strAbundanceFileHelp, 1149 args.add_argument("-l", ConstantsMicropita.c_strLastMetadataNameArgument, dest="strLastMetadataName", metavar="metadata_id", default=None,
1088 default = sys.stdin) 1150 help=ConstantsMicropita.c_strLastMetadataNameHelp)
1089 argp.add_argument("ostmOutput", metavar = "output.txt", type = argparse.FileType("w"), help = ConstantsMicropita.c_strGenericOutputDataFileHelp, 1151 args.add_argument("-r", ConstantsMicropita.c_strTargetedFeatureMethodArgument, dest="strFeatureSelection", metavar="targeting_method", default=ConstantsMicropita.lsTargetedFeatureMethodValues[0],
1090 default = sys.stdout) 1152 choices=ConstantsMicropita.lsTargetedFeatureMethodValues, help=ConstantsMicropita.c_strTargetedFeatureMethodHelp)
1091 1153 args.add_argument("-t", ConstantsMicropita.c_strTargetedSelectionFileArgument, dest="istmFeatures",
1092 __doc__ = "::\n\n\t" + argp.format_help( ).replace( "\n", "\n\t" ) + __doc__ 1154 metavar="feature_file", type=argparse.FileType("rU"), help=ConstantsMicropita.c_strTargetedSelectionFileHelp)
1093 1155 args.add_argument("-w", ConstantsMicropita.c_strFeatureMetadataArgument, dest="strLastFeatureMetadata",
1094 def _main( ): 1156 metavar="Last_Feature_Metadata", default=None, help=ConstantsMicropita.c_strFeatureMetadataHelp)
1095 args = argp.parse_args( ) 1157
1096 1158 args = argp.add_argument_group(
1097 #Set up logger 1159 "Data labeling", "Metadata IDs for strata and supervised label values")
1098 iLogLevel = getattr(logging, args.strLogLevel.upper(), None) 1160 args.add_argument("-e", ConstantsMicropita.c_strSupervisedLabelArgument, dest="strLabel",
1099 logging.basicConfig(stream = args.ostmLoggingFile if args.ostmLoggingFile else sys.stderr, filemode = 'w', level=iLogLevel) 1161 metavar="supervised_id", help=ConstantsMicropita.c_strSupervisedLabelHelp)
1100 1162 args.add_argument("-s", ConstantsMicropita.c_strUnsupervisedStratifyMetadataArgument, dest="strUnsupervisedStratify", metavar="stratify_id",
1101 #Run micropita 1163 help=ConstantsMicropita.c_strUnsupervisedStratifyMetadataHelp)
1102 logging.info("MicroPITA:: Start microPITA") 1164
1103 microPITA = MicroPITA() 1165 args = argp.add_argument_group(
1104 1166 "File formatting", "Rarely modified file formatting options")
1105 #Argparse will append to the default but will not remove the default so I do this here 1167 args.add_argument("-j", ConstantsMicropita.c_strFileDelimiterArgument, dest="cFileDelimiter",
1106 if not len(args.lstrMethods): 1168 metavar="column_delimiter", default="\t", help=ConstantsMicropita.c_strFileDelimiterHelp)
1107 args.lstrMethods = [ConstantsMicropita.c_strRepresentative] 1169 args.add_argument("-k", ConstantsMicropita.c_strFeatureNameDelimiterArgument, dest="cFeatureNameDelimiter",
1108 1170 metavar="taxonomy_delimiter", default="|", help=ConstantsMicropita.c_strFeatureNameDelimiterHelp)
1109 dictSelectedSamples = microPITA.funcRun( 1171
1110 strIDName = args.strIDName, 1172 args = argp.add_argument_group(
1111 strLastMetadataName = args.strLastMetadataName, 1173 "Debugging", "Debugging options - modify at your own risk!")
1112 istmInput = args.istmInput, 1174 args.add_argument("-v", ConstantsMicropita.c_strLoggingArgument, dest="strLogLevel", metavar="log_level", default="WARNING",
1113 ostmInputPredictFile = args.ostmInputPredictFile, 1175 choices=ConstantsMicropita.c_lsLoggingChoices, help=ConstantsMicropita.c_strLoggingHelp)
1114 ostmPredictFile = args.ostmPredictFile, 1176 args.add_argument("-c", ConstantsMicropita.c_strCheckedAbundanceFileArgument, dest="ostmCheckedFile",
1115 ostmCheckedFile = args.ostmCheckedFile, 1177 metavar="output_qc", type=argparse.FileType("w"), help=ConstantsMicropita.c_strCheckedAbundanceFileHelp)
1116 ostmOutput = args.ostmOutput, 1178 args.add_argument("-g", ConstantsMicropita.c_strLoggingFileArgument, dest="ostmLoggingFile",
1117 cDelimiter = args.cFileDelimiter, 1179 metavar="output_log", type=argparse.FileType("w"), help=ConstantsMicropita.c_strLoggingFileHelp)
1118 cFeatureNameDelimiter = args.cFeatureNameDelimiter, 1180 args.add_argument("-u", ConstantsMicropita.c_strSupervisedInputFile, dest="ostmInputPredictFile",
1119 istmFeatures = args.istmFeatures, 1181 metavar="output_scaled", type=argparse.FileType("w"), help=ConstantsMicropita.c_strSupervisedInputFileHelp)
1120 strFeatureSelection = args.strFeatureSelection, 1182 args.add_argument("-p", ConstantsMicropita.c_strSupervisedPredictedFile, dest="ostmPredictFile",
1121 iCount = args.iCount, 1183 metavar="output_labels", type=argparse.FileType("w"), help=ConstantsMicropita.c_strSupervisedPredictedFileHelp)
1122 strLastRowMetadata = args.strLastFeatureMetadata, 1184
1123 strLabel = args.strLabel, 1185 argp.add_argument("istmInput", metavar="input.pcl/biome", type=argparse.FileType("rU"), help=ConstantsMicropita.c_strAbundanceFileHelp,
1124 strStratify = args.strUnsupervisedStratify, 1186 default=sys.stdin)
1125 strCustomAlpha = args.strAlphaDiversity, 1187 argp.add_argument("ostmOutput", metavar="output.txt", type=argparse.FileType("w"), help=ConstantsMicropita.c_strGenericOutputDataFileHelp,
1126 strCustomBeta = args.strBetaDiversity, 1188 default=sys.stdout)
1127 strAlphaMetadata = args.strAlphaMetadata, 1189
1128 istmBetaMatrix = args.istmBetaMatrix, 1190 __doc__ = "::\n\n\t" + argp.format_help().replace("\n", "\n\t") + __doc__
1129 istrmTree = args.istrmTree, 1191
1130 istrmEnvr = args.istrmEnvr, 1192
1131 lstrMethods = args.lstrMethods, 1193 def _main():
1132 fInvertDiversity = args.fInvertDiversity 1194 args = argp.parse_args()
1133 ) 1195
1134 1196 # Set up logger
1135 if not dictSelectedSamples: 1197 iLogLevel = getattr(logging, args.strLogLevel.upper(), None)
1136 logging.error("MicroPITA:: Error, did not get a result from analysis.") 1198 logging.basicConfig(
1137 return -1 1199 stream=args.ostmLoggingFile if args.ostmLoggingFile else sys.stderr, filemode='w', level=iLogLevel)
1138 logging.info("End microPITA") 1200
1139 1201 # Run micropita
1140 #Log output for debugging 1202 logging.info("MicroPITA:: Start microPITA")
1141 logging.debug("MicroPITA:: Returned the following samples:"+str(dictSelectedSamples)) 1203 microPITA = MicroPITA()
1142 1204
1143 #Write selection to file 1205 # Argparse will append to the default but will not remove the default so I do this here
1144 microPITA.funcWriteSelectionToFile(dictSelection=dictSelectedSamples, xOutputFilePath=args.ostmOutput) 1206 if not len(args.lstrMethods):
1207 args.lstrMethods = [ConstantsMicropita.c_strRepresentative]
1208
1209 dictSelectedSamples = microPITA.funcRun(
1210 strIDName=args.strIDName,
1211 strLastMetadataName=args.strLastMetadataName,
1212 istmInput=args.istmInput,
1213 ostmInputPredictFile=args.ostmInputPredictFile,
1214 ostmPredictFile=args.ostmPredictFile,
1215 ostmCheckedFile=args.ostmCheckedFile,
1216 ostmOutput=args.ostmOutput,
1217 cDelimiter=args.cFileDelimiter,
1218 cFeatureNameDelimiter=args.cFeatureNameDelimiter,
1219 istmFeatures=args.istmFeatures,
1220 strFeatureSelection=args.strFeatureSelection,
1221 iCount=args.iCount,
1222 strLastRowMetadata=args.strLastFeatureMetadata,
1223 strLabel=args.strLabel,
1224 strStratify=args.strUnsupervisedStratify,
1225 strCustomAlpha=args.strAlphaDiversity,
1226 strCustomBeta=args.strBetaDiversity,
1227 strAlphaMetadata=args.strAlphaMetadata,
1228 istmBetaMatrix=args.istmBetaMatrix,
1229 istrmTree=args.istrmTree,
1230 istrmEnvr=args.istrmEnvr,
1231 lstrMethods=args.lstrMethods,
1232 fInvertDiversity=args.fInvertDiversity
1233 )
1234
1235 if not dictSelectedSamples:
1236 logging.error("MicroPITA:: Error, did not get a result from analysis.")
1237 return -1
1238 logging.info("End microPITA")
1239
1240 # Log output for debugging
1241 logging.debug("MicroPITA:: Returned the following samples:" +
1242 str(dictSelectedSamples))
1243
1244 # Write selection to file
1245 microPITA.funcWriteSelectionToFile(
1246 dictSelection=dictSelectedSamples, xOutputFilePath=args.ostmOutput)
1247
1145 1248
1146 if __name__ == "__main__": 1249 if __name__ == "__main__":
1147 _main( ) 1250 _main()