Mercurial > repos > george-weingart > micropita
comparison MicroPITA.py @ 28:1d09ffab87a7 draft
Uploaded MicroPITA.py - fixed spaces and tabs inconsistencies
author | george-weingart |
---|---|
date | Tue, 22 Jun 2021 03:23:17 +0000 |
parents | 7d25ecd225dd |
children |
comparison
equal
deleted
inserted
replaced
27:d9862a9a4d84 | 28:1d09ffab87a7 |
---|---|
3 Author: Timothy Tickle | 3 Author: Timothy Tickle |
4 Description: Class to Run analysis for the microPITA paper | 4 Description: Class to Run analysis for the microPITA paper |
5 """ | 5 """ |
6 | 6 |
7 ##################################################################################### | 7 ##################################################################################### |
8 #Copyright (C) <2012> | 8 # Copyright (C) <2012> |
9 # | 9 # |
10 #Permission is hereby granted, free of charge, to any person obtaining a copy of | 10 # Permission is hereby granted, free of charge, to any person obtaining a copy of |
11 #this software and associated documentation files (the "Software"), to deal in the | 11 # this software and associated documentation files (the "Software"), to deal in the |
12 #Software without restriction, including without limitation the rights to use, copy, | 12 # Software without restriction, including without limitation the rights to use, copy, |
13 #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, | 13 # modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, |
14 #and to permit persons to whom the Software is furnished to do so, subject to | 14 # and to permit persons to whom the Software is furnished to do so, subject to |
15 #the following conditions: | 15 # the following conditions: |
16 # | 16 # |
17 #The above copyright notice and this permission notice shall be included in all copies | 17 # The above copyright notice and this permission notice shall be included in all copies |
18 #or substantial portions of the Software. | 18 # or substantial portions of the Software. |
19 # | 19 # |
20 #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, | 20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, |
21 #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A | 21 # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A |
22 #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | 22 # PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT |
23 #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION | 23 # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION |
24 #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | 24 # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
25 #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | 25 # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
26 ##################################################################################### | 26 ##################################################################################### |
27 | 27 |
28 from types import * | |
29 import scipy.spatial.distance | |
30 import scipy.cluster.hierarchy as hcluster | |
31 import random | |
32 import os | |
33 import operator | |
34 import numpy as np | |
35 import mlpy | |
36 import math | |
37 import logging | |
38 import csv | |
39 from src.ConstantsMicropita import ConstantsMicropita | |
40 from src.breadcrumbs.src.UtilityMath import UtilityMath | |
41 from src.breadcrumbs.src.SVM import SVM | |
42 from src.breadcrumbs.src.MLPYDistanceAdaptor import MLPYDistanceAdaptor | |
43 from src.breadcrumbs.src.KMedoids import Kmedoids | |
44 from src.breadcrumbs.src.Metric import Metric | |
45 from src.breadcrumbs.src.ConstantsBreadCrumbs import ConstantsBreadCrumbs | |
46 from src.breadcrumbs.src.AbundanceTable import AbundanceTable | |
28 __author__ = "Timothy Tickle" | 47 __author__ = "Timothy Tickle" |
29 __copyright__ = "Copyright 2012" | 48 __copyright__ = "Copyright 2012" |
30 __credits__ = ["Timothy Tickle"] | 49 __credits__ = ["Timothy Tickle"] |
31 __license__ = "MIT" | 50 __license__ = "MIT" |
32 __maintainer__ = "Timothy Tickle" | 51 __maintainer__ = "Timothy Tickle" |
33 __email__ = "ttickle@sph.harvard.edu" | 52 __email__ = "ttickle@sph.harvard.edu" |
34 __status__ = "Development" | 53 __status__ = "Development" |
35 | 54 |
36 import sys | 55 import sys |
37 import argparse | 56 import argparse |
38 from src.breadcrumbs.src.AbundanceTable import AbundanceTable | |
39 import warnings | 57 import warnings |
40 warnings.simplefilter(action = "ignore", category = FutureWarning) | 58 warnings.simplefilter(action="ignore", category=FutureWarning) |
41 from src.breadcrumbs.src.ConstantsBreadCrumbs import ConstantsBreadCrumbs | 59 |
42 from src.breadcrumbs.src.Metric import Metric | |
43 from src.breadcrumbs.src.KMedoids import Kmedoids | |
44 from src.breadcrumbs.src.MLPYDistanceAdaptor import MLPYDistanceAdaptor | |
45 from src.breadcrumbs.src.SVM import SVM | |
46 from src.breadcrumbs.src.UtilityMath import UtilityMath | |
47 | |
48 from src.ConstantsMicropita import ConstantsMicropita | |
49 import csv | |
50 import logging | |
51 import math | |
52 import mlpy | |
53 import numpy as np | |
54 import operator | |
55 import os | |
56 import random | |
57 import scipy.cluster.hierarchy as hcluster | |
58 import scipy.spatial.distance | |
59 from types import * | |
60 | 60 |
61 class MicroPITA: | 61 class MicroPITA: |
62 """ | 62 """ |
63 Selects samples from a first tier of a multi-tiered study to be used in a second tier. | 63 Selects samples from a first tier of a multi-tiered study to be used in a second tier. |
64 Different methods can be used for selection. | 64 Different methods can be used for selection. |
65 The expected input is an abundance table (and potentially a text file of targeted features, | 65 The expected input is an abundance table (and potentially a text file of targeted features, |
66 if using the targeted features option). Output is a list of samples exhibiting the | 66 if using the targeted features option). Output is a list of samples exhibiting the |
67 characteristics of interest. | 67 characteristics of interest. |
68 """ | 68 """ |
69 | 69 |
70 #Constants | 70 # Constants |
71 #Diversity metrics Alpha | 71 # Diversity metrics Alpha |
72 c_strInverseSimpsonDiversity = Metric.c_strInvSimpsonDiversity | 72 c_strInverseSimpsonDiversity = Metric.c_strInvSimpsonDiversity |
73 c_strChao1Diversity = Metric.c_strChao1Diversity | 73 c_strChao1Diversity = Metric.c_strChao1Diversity |
74 | 74 |
75 #Diversity metrics Beta | 75 # Diversity metrics Beta |
76 c_strBrayCurtisDissimilarity = Metric.c_strBrayCurtisDissimilarity | 76 c_strBrayCurtisDissimilarity = Metric.c_strBrayCurtisDissimilarity |
77 | 77 |
78 #Additive inverses of diversity metrics beta | 78 # Additive inverses of diversity metrics beta |
79 c_strInvBrayCurtisDissimilarity = Metric.c_strInvBrayCurtisDissimilarity | 79 c_strInvBrayCurtisDissimilarity = Metric.c_strInvBrayCurtisDissimilarity |
80 | 80 |
81 #Technique Names | 81 # Technique Names |
82 ConstantsMicropita.c_strDiversity2 = ConstantsMicropita.c_strDiversity+"_C" | 82 ConstantsMicropita.c_strDiversity2 = ConstantsMicropita.c_strDiversity+"_C" |
83 | 83 |
84 #Targeted feature settings | 84 # Targeted feature settings |
85 c_strTargetedRanked = ConstantsMicropita.c_strTargetedRanked | 85 c_strTargetedRanked = ConstantsMicropita.c_strTargetedRanked |
86 c_strTargetedAbundance = ConstantsMicropita.c_strTargetedAbundance | 86 c_strTargetedAbundance = ConstantsMicropita.c_strTargetedAbundance |
87 | 87 |
88 #Technique groupings | 88 # Technique groupings |
89 # c_lsDiversityMethods = [ConstantsMicropita.c_strDiversity,ConstantsMicropita.c_strDiversity2] | 89 # c_lsDiversityMethods = [ConstantsMicropita.c_strDiversity,ConstantsMicropita.c_strDiversity2] |
90 | 90 |
91 #Converts ecology metrics into standardized method selection names | 91 # Converts ecology metrics into standardized method selection names |
92 dictConvertAMetricDiversity = {c_strInverseSimpsonDiversity:ConstantsMicropita.c_strDiversity, c_strChao1Diversity:ConstantsMicropita.c_strDiversity2} | 92 dictConvertAMetricDiversity = {c_strInverseSimpsonDiversity: ConstantsMicropita.c_strDiversity, |
93 c_strChao1Diversity: ConstantsMicropita.c_strDiversity2} | |
93 # dictConvertMicroPITAToAMetric = {ConstantsMicropita.c_strDiversity:c_strInverseSimpsonDiversity, ConstantsMicropita.c_strDiversity2:c_strChao1Diversity} | 94 # dictConvertMicroPITAToAMetric = {ConstantsMicropita.c_strDiversity:c_strInverseSimpsonDiversity, ConstantsMicropita.c_strDiversity2:c_strChao1Diversity} |
94 dictConvertBMetricToMethod = {c_strBrayCurtisDissimilarity:ConstantsMicropita.c_strRepresentative} | 95 dictConvertBMetricToMethod = { |
95 dictConvertInvBMetricToMethod = {c_strBrayCurtisDissimilarity:ConstantsMicropita.c_strExtreme} | 96 c_strBrayCurtisDissimilarity: ConstantsMicropita.c_strRepresentative} |
96 | 97 dictConvertInvBMetricToMethod = { |
97 #Linkage used in the Hierarchical clustering | 98 c_strBrayCurtisDissimilarity: ConstantsMicropita.c_strExtreme} |
98 c_strHierarchicalClusterMethod = 'average' | 99 |
99 | 100 # Linkage used in the Hierarchical clustering |
100 ####Group 1## Diversity | 101 c_strHierarchicalClusterMethod = 'average' |
101 #Testing: Happy path Testing (8) | 102 |
102 def funcGetTopRankedSamples(self, lldMatrix = None, lsSampleNames = None, iTopAmount = None): | 103 # Group 1## Diversity |
103 """ | 104 # Testing: Happy path Testing (8) |
104 Given a list of lists of measurements, for each list the indices of the highest values are returned. If lsSampleNames is given | 105 def funcGetTopRankedSamples(self, lldMatrix=None, lsSampleNames=None, iTopAmount=None): |
105 it is treated as a list of string names that is in the order of the measurements in each list. Indices are returned or the sample | 106 """ |
106 names associated with the indices. | 107 Given a list of lists of measurements, for each list the indices of the highest values are returned. If lsSampleNames is given |
107 | 108 it is treated as a list of string names that is in the order of the measurements in each list. Indices are returned or the sample |
108 :param lldMatrix: List of lists [[value,value,value,value],[value,value,value,value]]. | 109 names associated with the indices. |
109 :type: List of lists List of measurements. Each list is a different measurement. Each measurement is positionally related to a sample. | 110 |
110 :param lsSampleNames: List of sample names positionally related (the same) to each list (Optional). | 111 :param lldMatrix: List of lists [[value,value,value,value],[value,value,value,value]]. |
111 :type: List of strings List of strings. | 112 :type: List of lists List of measurements. Each list is a different measurement. Each measurement is positionally related to a sample. |
112 :param iTopAmount: The amount of top measured samples (assumes the higher measurements are better). | 113 :param lsSampleNames: List of sample names positionally related (the same) to each list (Optional). |
113 :type: integer Integer amount of sample names/ indices to return. | 114 :type: List of strings List of strings. |
114 :return List: List of samples to be selected. | 115 :param iTopAmount: The amount of top measured samples (assumes the higher measurements are better). |
115 """ | 116 :type: integer Integer amount of sample names/ indices to return. |
116 topRankListRet = [] | 117 :return List: List of samples to be selected. |
117 for rowMetrics in lldMatrix: | 118 """ |
118 #Create 2 d array to hold value and index and sort | 119 topRankListRet = [] |
119 liIndexX = [rowMetrics,range(len(rowMetrics))] | 120 for rowMetrics in lldMatrix: |
120 liIndexX[1].sort(key = liIndexX[0].__getitem__,reverse = True) | 121 # Create 2 d array to hold value and index and sort |
121 | 122 liIndexX = [rowMetrics, range(len(rowMetrics))] |
122 if lsSampleNames: | 123 liIndexX[1].sort(key=liIndexX[0].__getitem__, reverse=True) |
123 topRankListRet.append([lsSampleNames[iIndex] for iIndex in liIndexX[1][:iTopAmount]]) | 124 |
124 else: | 125 if lsSampleNames: |
125 topRankListRet.append(liIndexX[1][:iTopAmount]) | 126 topRankListRet.append([lsSampleNames[iIndex] |
126 | 127 for iIndex in liIndexX[1][:iTopAmount]]) |
127 return topRankListRet | 128 else: |
128 | 129 topRankListRet.append(liIndexX[1][:iTopAmount]) |
129 ####Group 2## Representative Dissimilarity | 130 |
130 #Testing: Happy path tested 1 | 131 return topRankListRet |
131 def funcGetCentralSamplesByKMedoids(self, npaMatrix=None, sMetric=None, lsSampleNames=None, iNumberSamplesReturned=0, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None): | 132 |
132 """ | 133 # Group 2## Representative Dissimilarity |
133 Gets centroid samples by k-medoids clustering of a given matrix. | 134 # Testing: Happy path tested 1 |
134 | 135 def funcGetCentralSamplesByKMedoids(self, npaMatrix=None, sMetric=None, lsSampleNames=None, iNumberSamplesReturned=0, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None): |
135 :param npaMatrix: Numpy array where row=features and columns=samples | 136 """ |
136 :type: Numpy array Abundance Data. | 137 Gets centroid samples by k-medoids clustering of a given matrix. |
137 :param sMetric: String name of beta metric used as the distance metric. | 138 |
138 :type: String String name of beta metric. | 139 :param npaMatrix: Numpy array where row=features and columns=samples |
139 :param lsSampleNames: The names of the sample | 140 :type: Numpy array Abundance Data. |
140 :type: List List of strings | 141 :param sMetric: String name of beta metric used as the distance metric. |
141 :param iNumberSamplesReturned: Number of samples to return, each will be the medoid of a cluster. | 142 :type: String String name of beta metric. |
142 :type: Integer Number of samples to return | 143 :param lsSampleNames: The names of the sample |
143 :return List: List of selected samples. | 144 :type: List List of strings |
144 :param istmBetaMatrix: File with beta-diversity matrix | 145 :param iNumberSamplesReturned: Number of samples to return, each will be the medoid of a cluster. |
145 :type: File stream or file path string | 146 :type: Integer Number of samples to return |
146 """ | 147 :return List: List of selected samples. |
147 | 148 :param istmBetaMatrix: File with beta-diversity matrix |
148 #Count of how many rows | 149 :type: File stream or file path string |
149 sampleCount = npaMatrix.shape[0] | 150 """ |
150 if iNumberSamplesReturned > sampleCount: | 151 |
151 logging.error("MicroPITA.funcGetCentralSamplesByKMedoids:: There are not enough samples to return the amount of samples specified. Return sample count = "+str(iNumberSamplesReturned)+". Sample number = "+str(sampleCount)+".") | 152 # Count of how many rows |
152 return False | 153 sampleCount = npaMatrix.shape[0] |
153 | 154 if iNumberSamplesReturned > sampleCount: |
154 #If the cluster count is equal to the sample count return all samples | 155 logging.error("MicroPITA.funcGetCentralSamplesByKMedoids:: There are not enough samples to return the amount of samples specified. Return sample count = " + |
155 if sampleCount == iNumberSamplesReturned: | 156 str(iNumberSamplesReturned)+". Sample number = "+str(sampleCount)+".") |
156 return list(lsSampleNames) | 157 return False |
157 | 158 |
158 #Get distance matrix | 159 # If the cluster count is equal to the sample count return all samples |
159 distanceMatrix=scipy.spatial.distance.squareform(Metric.funcReadMatrixFile(istmMatrixFile=istmBetaMatrix,lsSampleOrder=lsSampleNames)[0]) if istmBetaMatrix else Metric.funcGetBetaMetric(npadAbundancies=npaMatrix, sMetric=sMetric, istrmTree=istrmTree, istrmEnvr=istrmEnvr, lsSampleOrder=lsSampleNames) | 160 if sampleCount == iNumberSamplesReturned: |
160 if type(distanceMatrix) is BooleanType: | 161 return list(lsSampleNames) |
161 logging.error("MicroPITA.funcGetCentralSamplesByKMedoids:: Could not read in the supplied distance matrix, returning false.") | 162 |
162 return False | 163 # Get distance matrix |
163 | 164 distanceMatrix = scipy.spatial.distance.squareform(Metric.funcReadMatrixFile(istmMatrixFile=istmBetaMatrix, lsSampleOrder=lsSampleNames)[ |
164 # Handle unifrac output | 165 0]) if istmBetaMatrix else Metric.funcGetBetaMetric(npadAbundancies=npaMatrix, sMetric=sMetric, istrmTree=istrmTree, istrmEnvr=istrmEnvr, lsSampleOrder=lsSampleNames) |
165 if sMetric in [Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted]: | 166 if type(distanceMatrix) is BooleanType: |
166 distanceMatrix = distanceMatrix[0] | 167 logging.error( |
167 | 168 "MicroPITA.funcGetCentralSamplesByKMedoids:: Could not read in the supplied distance matrix, returning false.") |
168 #Log distance matrix | 169 return False |
169 logging.debug("MicroPITA.funcGetCentralSamplesByKMedoids:: Distance matrix for representative selection using metric="+str(sMetric)) | 170 |
170 | 171 # Handle unifrac output |
171 distance = MLPYDistanceAdaptor(npaDistanceMatrix=distanceMatrix, fIsCondensedMatrix=True) | 172 if sMetric in [Metric.c_strUnifracUnweighted, Metric.c_strUnifracWeighted]: |
172 | 173 distanceMatrix = distanceMatrix[0] |
173 #Create object to determine clusters/medoids | 174 |
174 medoidsMaker = Kmedoids(k=iNumberSamplesReturned, dist=distance) | 175 # Log distance matrix |
175 #medoidsData includes(1d numpy array, medoids indexes; | 176 logging.debug( |
176 # 1d numpy array, non-medoids indexes; | 177 "MicroPITA.funcGetCentralSamplesByKMedoids:: Distance matrix for representative selection using metric="+str(sMetric)) |
177 # 1d numpy array, cluster membership for non-medoids; | 178 |
178 # double, cost of configuration) | 179 distance = MLPYDistanceAdaptor( |
179 #npaMatrix is samples x rows | 180 npaDistanceMatrix=distanceMatrix, fIsCondensedMatrix=True) |
180 #Build a matrix of lists of indices to pass to the distance matrix | 181 |
181 lliIndicesMatrix = [[iIndexPosition] for iIndexPosition in xrange(0,len(npaMatrix))] | 182 # Create object to determine clusters/medoids |
182 medoidsData = medoidsMaker.compute(np.array(lliIndicesMatrix)) | 183 medoidsMaker = Kmedoids(k=iNumberSamplesReturned, dist=distance) |
183 logging.debug("MicroPITA.funcGetCentralSamplesByKMedoids:: Results from the kmedoid method in representative selection:") | 184 # medoidsData includes(1d numpy array, medoids indexes; |
184 logging.debug(str(medoidsData)) | 185 # 1d numpy array, non-medoids indexes; |
185 | 186 # 1d numpy array, cluster membership for non-medoids; |
186 #If returning the same amount of clusters and samples | 187 # double, cost of configuration) |
187 #Return centroids | 188 # npaMatrix is samples x rows |
188 selectedIndexes = medoidsData[0] | 189 # Build a matrix of lists of indices to pass to the distance matrix |
189 return [lsSampleNames[selectedIndexes[index]] for index in xrange(0,iNumberSamplesReturned)] | 190 lliIndicesMatrix = [[iIndexPosition] |
190 | 191 for iIndexPosition in xrange(0, len(npaMatrix))] |
191 ####Group 3## Highest Dissimilarity | 192 medoidsData = medoidsMaker.compute(np.array(lliIndicesMatrix)) |
192 #Testing: Happy path tested | 193 logging.debug( |
193 def funcSelectExtremeSamplesFromHClust(self, strBetaMetric, npaAbundanceMatrix, lsSampleNames, iSelectSampleCount, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None): | 194 "MicroPITA.funcGetCentralSamplesByKMedoids:: Results from the kmedoid method in representative selection:") |
194 """ | 195 logging.debug(str(medoidsData)) |
195 Select extreme samples from HClustering. | 196 |
196 | 197 # If returning the same amount of clusters and samples |
197 :param strBetaMetric: The beta metric to use for distance matrix generation. | 198 # Return centroids |
198 :type: String The name of the beta metric to use. | 199 selectedIndexes = medoidsData[0] |
199 :param npaAbundanceMatrix: Numpy array where row=samples and columns=features. | 200 return [lsSampleNames[selectedIndexes[index]] for index in xrange(0, iNumberSamplesReturned)] |
200 :type: Numpy Array Abundance data. | 201 |
201 :param lsSampleNames: The names of the sample. | 202 # Group 3## Highest Dissimilarity |
202 :type: List List of strings. | 203 # Testing: Happy path tested |
203 :param iSelectSampleCount: Number of samples to select (return). | 204 def funcSelectExtremeSamplesFromHClust(self, strBetaMetric, npaAbundanceMatrix, lsSampleNames, iSelectSampleCount, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None): |
204 :type: Integer Integer number of samples returned. | 205 """ |
205 :return Samples: List of samples. | 206 Select extreme samples from HClustering. |
206 :param istmBetaMatrix: File with beta-diversity matrix | 207 |
207 :type: File stream or file path string | 208 :param strBetaMetric: The beta metric to use for distance matrix generation. |
208 """ | 209 :type: String The name of the beta metric to use. |
209 | 210 :param npaAbundanceMatrix: Numpy array where row=samples and columns=features. |
210 #If they want all the sample count, return all sample names | 211 :type: Numpy Array Abundance data. |
211 iSampleCount=len(npaAbundanceMatrix[:,0]) | 212 :param lsSampleNames: The names of the sample. |
212 if iSelectSampleCount==iSampleCount: | 213 :type: List List of strings. |
213 return lsSampleNames | 214 :param iSelectSampleCount: Number of samples to select (return). |
214 | 215 :type: Integer Integer number of samples returned. |
215 #Holds the samples to be returned | 216 :return Samples: List of samples. |
216 lsReturnSamplesRet = [] | 217 :param istmBetaMatrix: File with beta-diversity matrix |
217 | 218 :type: File stream or file path string |
218 #Generate beta matrix | 219 """ |
219 #Returns condensed matrix | 220 |
220 tempDistanceMatrix = scipy.spatial.distance.squareform(Metric.funcReadMatrixFile(istmMatrixFile=istmBetaMatrix,lsSampleOrder=lsSampleNames)[0]) if istmBetaMatrix else Metric.funcGetBetaMetric(npadAbundancies=npaAbundanceMatrix, sMetric=strBetaMetric, istrmTree=istrmTree, istrmEnvr=istrmEnvr, lsSampleOrder=lsSampleNames, fAdditiveInverse = True) | 221 # If they want all the sample count, return all sample names |
221 | 222 iSampleCount = len(npaAbundanceMatrix[:, 0]) |
222 if strBetaMetric in [Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted]: | 223 if iSelectSampleCount == iSampleCount: |
223 tempDistanceMatrix = tempDistanceMatrix[0] | 224 return lsSampleNames |
224 | 225 |
225 if type(tempDistanceMatrix) is BooleanType: | 226 # Holds the samples to be returned |
226 logging.error("MicroPITA.funcSelectExtremeSamplesFromHClust:: Could not read in the supplied distance matrix, returning false.") | 227 lsReturnSamplesRet = [] |
227 return False | 228 |
228 | 229 # Generate beta matrix |
229 if istmBetaMatrix: | 230 # Returns condensed matrix |
230 tempDistanceMatrix = 1-tempDistanceMatrix | 231 tempDistanceMatrix = scipy.spatial.distance.squareform(Metric.funcReadMatrixFile(istmMatrixFile=istmBetaMatrix, lsSampleOrder=lsSampleNames)[0]) if istmBetaMatrix else Metric.funcGetBetaMetric( |
231 | 232 npadAbundancies=npaAbundanceMatrix, sMetric=strBetaMetric, istrmTree=istrmTree, istrmEnvr=istrmEnvr, lsSampleOrder=lsSampleNames, fAdditiveInverse=True) |
232 #Feed beta matrix to linkage to cluster | 233 |
233 #Send condensed matrix | 234 if strBetaMetric in [Metric.c_strUnifracUnweighted, Metric.c_strUnifracWeighted]: |
234 linkageMatrix = hcluster.linkage(tempDistanceMatrix, method=self.c_strHierarchicalClusterMethod) | 235 tempDistanceMatrix = tempDistanceMatrix[0] |
235 | 236 |
236 #Extract cluster information from dendrogram | 237 if type(tempDistanceMatrix) is BooleanType: |
237 #The linkage matrix is of the form | 238 logging.error( |
238 #[[int1 int2 double int3],...] | 239 "MicroPITA.funcSelectExtremeSamplesFromHClust:: Could not read in the supplied distance matrix, returning false.") |
239 #int1 and int2 are the paired samples indexed at 0 and up. | 240 return False |
240 #each list is an entry for a branch that is number starting with the first | 241 |
241 #list being sample count index + 1 | 242 if istmBetaMatrix: |
242 #each list is then named by an increment as they appear | 243 tempDistanceMatrix = 1-tempDistanceMatrix |
243 #this means that if a number is in the list and is = sample count or greater it is not | 244 |
244 #terminal and is instead a branch. | 245 # Feed beta matrix to linkage to cluster |
245 #This method just takes the lowest metric measurement (highest distance pairs/clusters) | 246 # Send condensed matrix |
246 #Works much better than the original technique | 247 linkageMatrix = hcluster.linkage( |
247 #get total number of samples | 248 tempDistanceMatrix, method=self.c_strHierarchicalClusterMethod) |
248 | 249 |
249 iCurrentSelectCount = 0 | 250 # Extract cluster information from dendrogram |
250 for row in linkageMatrix: | 251 # The linkage matrix is of the form |
251 #Get nodes of the lowest pairing (so the furthest apart pair) | 252 # [[int1 int2 double int3],...] |
252 iNode1 = int(row[0]) | 253 # int1 and int2 are the paired samples indexed at 0 and up. |
253 iNode2 = int(row[1]) | 254 # each list is an entry for a branch that is number starting with the first |
254 #Make sure the nodes are a terminal node (sample) and not a branch in the dendrogram | 255 # list being sample count index + 1 |
255 #The branching in the dendrogram will start at the number of samples and increment higher. | 256 # each list is then named by an increment as they appear |
256 #Add each of the pair one at a time breaking when enough samples are selected. | 257 # this means that if a number is in the list and is = sample count or greater it is not |
257 if iNode1<iSampleCount: | 258 # terminal and is instead a branch. |
258 lsReturnSamplesRet.append(lsSampleNames[iNode1]) | 259 # This method just takes the lowest metric measurement (highest distance pairs/clusters) |
259 iCurrentSelectCount = iCurrentSelectCount + 1 | 260 # Works much better than the original technique |
260 if iCurrentSelectCount == iSelectSampleCount: | 261 # get total number of samples |
261 break | 262 |
262 if iNode2<iSampleCount: | 263 iCurrentSelectCount = 0 |
263 lsReturnSamplesRet.append(lsSampleNames[iNode2]) | 264 for row in linkageMatrix: |
264 iCurrentSelectCount = iCurrentSelectCount + 1 | 265 # Get nodes of the lowest pairing (so the furthest apart pair) |
265 if iCurrentSelectCount == iSelectSampleCount: | 266 iNode1 = int(row[0]) |
266 break | 267 iNode2 = int(row[1]) |
267 | 268 # Make sure the nodes are a terminal node (sample) and not a branch in the dendrogram |
268 #Return selected samples | 269 # The branching in the dendrogram will start at the number of samples and increment higher. |
269 return lsReturnSamplesRet | 270 # Add each of the pair one at a time breaking when enough samples are selected. |
270 | 271 if iNode1 < iSampleCount: |
271 ####Group 4## Rank Average of user Defined Taxa | 272 lsReturnSamplesRet.append(lsSampleNames[iNode1]) |
272 #Testing: Happy Path Tested | 273 iCurrentSelectCount = iCurrentSelectCount + 1 |
273 def funcGetAverageAbundanceSamples(self, abndTable, lsTargetedFeature, fRank=False): | 274 if iCurrentSelectCount == iSelectSampleCount: |
274 """ | 275 break |
275 Averages feature abundance or ranked abundance. Expects a column 0 of taxa id that is skipped. | 276 if iNode2 < iSampleCount: |
276 | 277 lsReturnSamplesRet.append(lsSampleNames[iNode2]) |
277 :param abndTable: Abundance Table to analyse | 278 iCurrentSelectCount = iCurrentSelectCount + 1 |
278 :type: AbundanceTable Abundance Table | 279 if iCurrentSelectCount == iSelectSampleCount: |
279 :param lsTargetedFeature: String names | 280 break |
280 :type: list list of string names of features (bugs) which are measured after ranking against the full sample | 281 |
281 :param fRank: Indicates to rank the abundance before getting the average abundance of the features (default false) | 282 # Return selected samples |
282 :type: boolean Flag indicating ranking abundance before calculating average feature measurement (false= no ranking) | 283 return lsReturnSamplesRet |
283 :return List of lists or boolean: List of lists or False on error. One internal list per sample indicating the sample, | 284 |
284 feature average abundance or ranked abundance. Lists will already be sorted. | 285 # Group 4## Rank Average of user Defined Taxa |
285 For not Ranked [[sample,average abundance of selected feature,1]] | 286 # Testing: Happy Path Tested |
286 For Ranked [[sample,average ranked abundance, average abundance of selected feature]] | 287 def funcGetAverageAbundanceSamples(self, abndTable, lsTargetedFeature, fRank=False): |
287 Error Returns false | 288 """ |
288 """ | 289 Averages feature abundance or ranked abundance. Expects a column 0 of taxa id that is skipped. |
289 | 290 |
290 llAbundance = abndTable.funcGetAverageAbundancePerSample(lsTargetedFeature) | 291 :param abndTable: Abundance Table to analyse |
291 if not llAbundance: | 292 :type: AbundanceTable Abundance Table |
292 logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not get average abundance, returned false. Make sure the features (bugs) are spelled correctly and in the abundance table.") | 293 :param lsTargetedFeature: String names |
293 return False | 294 :type: list list of string names of features (bugs) which are measured after ranking against the full sample |
294 #Add a space for ranking if needed | 295 :param fRank: Indicates to rank the abundance before getting the average abundance of the features (default false) |
295 #Not ranked will be [[sSample,average abundance,1]] | 296 :type: boolean Flag indicating ranking abundance before calculating average feature measurement (false= no ranking) |
296 #(where 1 will not discriminate ties if used in later functions, so this generalizes) | 297 :return List of lists or boolean: List of lists or False on error. One internal list per sample indicating the sample, |
297 #Ranked will be [[sSample, average rank, average abundance]] | 298 feature average abundance or ranked abundance. Lists will already be sorted. |
298 llRetAbundance = [[llist[0],-1,llist[1]] for llist in llAbundance] | 299 For not Ranked [[sample,average abundance of selected feature,1]] |
299 #Rank if needed | 300 For Ranked [[sample,average ranked abundance, average abundance of selected feature]] |
300 if fRank: | 301 Error Returns false |
301 abndRanked = abndTable.funcRankAbundance() | 302 """ |
302 if abndRanked == None: | 303 |
303 logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not rank the abundance table, returned false.") | 304 llAbundance = abndTable.funcGetAverageAbundancePerSample( |
304 return False | 305 lsTargetedFeature) |
305 llRetRank = abndRanked.funcGetAverageAbundancePerSample(lsTargetedFeature) | 306 if not llAbundance: |
306 if not llRetRank: | 307 logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not get average abundance, returned false. Make sure the features (bugs) are spelled correctly and in the abundance table.") |
307 logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not get average ranked abundance, returned false. Make sure the features (bugs) are spelled correctly and in the abundance table.") | 308 return False |
308 return False | 309 # Add a space for ranking if needed |
309 dictRanks = dict(llRetRank) | 310 # Not ranked will be [[sSample,average abundance,1]] |
310 llRetAbundance = [[a[0],dictRanks[a[0]],a[2]] for a in llRetAbundance] | 311 # (where 1 will not discriminate ties if used in later functions, so this generalizes) |
311 | 312 # Ranked will be [[sSample, average rank, average abundance]] |
312 #Sort first for ties and then for the main feature | 313 llRetAbundance = [[llist[0], -1, llist[1]] for llist in llAbundance] |
313 if not fRank or ConstantsMicropita.c_fBreakRankTiesByDiversity: | 314 # Rank if needed |
314 llRetAbundance = sorted(llRetAbundance, key = lambda sampleData: sampleData[2], reverse = not fRank) | 315 if fRank: |
315 if fRank: | 316 abndRanked = abndTable.funcRankAbundance() |
316 llRetAbundance = sorted(llRetAbundance, key = lambda sampleData: sampleData[1], reverse = not fRank) | 317 if abndRanked == None: |
317 return llRetAbundance | 318 logging.error( |
318 | 319 "MicroPITA.funcGetAverageAbundanceSamples:: Could not rank the abundance table, returned false.") |
319 #Testing: Happy Path Tested | 320 return False |
def funcSelectTargetedTaxaSamples(self, abndMatrix, lsTargetedTaxa, iSampleSelectionCount, sMethod=ConstantsMicropita.lsTargetedFeatureMethodValues[0]):
    """
    Selects samples by the rank or abundance of the targeted features.
    The ordering is delegated to funcGetAverageAbundanceSamples; the first
    iSampleSelectionCount entries of its result are returned by name.

    :param abndMatrix: Abundance table to analyse
    :type: AbundanceTable
    :param lsTargetedTaxa: List of features
    :type: List of strings
    :param iSampleSelectionCount: Number of samples to select
    :type: Integer
    :param sMethod: Method to select targeted features. Ranking is used when it
                    equals self.c_strTargetedRanked (case-insensitive comparison).
    :type: String (Can be values found in ConstantsMicropita.lsTargetedFeatureMethodValues)
    :return List of strings: Names of the selected samples.
                             An empty list is returned on an error.
    """

    # Check data: without features there is nothing to select on
    if lsTargetedTaxa is None or len(lsTargetedTaxa) < 1:
        logging.error("MicroPITA.funcSelectTargetedTaxaSamples. Taxa defined selection was requested but no features were given.")
        return []

    fRank = sMethod.lower() == self.c_strTargetedRanked.lower()
    lsTargetedSamples = self.funcGetAverageAbundanceSamples(abndTable=abndMatrix, lsTargetedFeature=lsTargetedTaxa,
                                                            fRank=fRank)

    # funcGetAverageAbundanceSamples returns the boolean False on error.
    # Identity is checked so that only that sentinel is treated as a failure.
    if lsTargetedSamples is False:
        logging.error("MicroPITA.funcSelectTargetedTaxaSamples:: Was not able to select for the features given. So targeted feature selection was not performed. Check to make sure the features are spelled correctly and exist in the abundance file.")
        return []

    # Each entry starts with the sample name; keep only the requested number
    return [sSample[0] for sSample in lsTargetedSamples[:iSampleSelectionCount]]
351 | 352 :param sMethod: Method to select targeted features |
352 ####Group 5## Random | 353 :type: string String (Can be values found in ConstantsMicropita.lsTargetedFeatureMethodValues) |
353 #Testing: Happy path Tested | 354 :return List of strings: List of sample names which were selected |
def funcGetRandomSamples(self, lsSamples=None, iNumberOfSamplesToReturn=0):
    """
    Returns random sample names of the number given. No replacement.
    The selected names keep the relative order they have in lsSamples.

    :param lsSamples: List of sample names. None (the default) is treated as no samples.
    :type: List of strings
    :param iNumberOfSamplesToReturn: Number of samples to select
    :type: Integer
    :return List: List of selected samples (strings). When at least as many samples
                  are requested as exist, lsSamples itself is returned.
    """

    # The default argument means no samples were supplied, so nothing can be drawn
    if lsSamples is None:
        return []

    # Input sample count
    iSampleCount = len(lsSamples)

    # Return the full list if they ask for as many samples as exist (or more)
    if iNumberOfSamplesToReturn >= iSampleCount:
        return lsSamples

    # Get the random indices for the sample (without replacement).
    # A set keeps the membership test below constant time per sample.
    setRandomIndices = set(random.sample(range(iSampleCount), iNumberOfSamplesToReturn))

    # Keep the samples whose index was drawn
    return [sSample for iIndex, sSample in enumerate(lsSamples) if iIndex in setRandomIndices]
377 | 378 Returns random sample names of the number given. No replacement. |
378 #Happy path tested (case 3) | 379 |
def funcGetAveragePopulation(self, abndTable, lfCompress):
    """
    Averages every row of the abundance table over the flagged samples.

    :param abndTable: AbundanceTable of data to be averaged
    :type: AbundanceTable
    :param lfCompress: One flag per sample; a false flag removes the sample before averaging
    :type: List of booleans
    :return List of doubles: One average per row of the table, in row order.
                             An empty list is returned when no sample is flagged.
    """
    # No sample is flagged for inclusion, so there is nothing to average
    if sum(lfCompress) == 0:
        return []

    def _averageOfKeptSamples(rowFeature):
        # The first entry of a row is skipped; the rest are the per-sample measurements
        ldKept = np.compress(lfCompress, list(rowFeature)[1:], axis=0)
        return sum(ldKept)/float(len(ldKept))

    return [_averageOfKeptSamples(rowFeature) for rowFeature in abndTable.funcGetAbundanceCopy()]
400 | 401 # Happy path tested (case 3) |
401 #Happy path tested (2 cases) | 402 def funcGetAveragePopulation(self, abndTable, lfCompress): |
def funcGetDistanceFromAverage(self, abndTable, ldAverage, lsSamples, lfSelected):
    """
    Measures how far each selected sample is from an average sample.
    Every selected sample is paired with ldAverage and passed to
    Metric.funcGetBrayCurtisDissimilarity; the first value of each result is kept.
    A sample is selected when the flag at its position in lfSelected is true.

    :param abndTable: Abundance table holding the data to be analyzed.
    :type: AbundanceTable
    :param ldAverage: Average population (average features of the abundance table of samples)
    :type: List of doubles
    :param lsSamples: Candidate sample ids, in the same order as lfSelected
    :type: List of strings (sample ids)
    :param lfSelected: Samples to be included in the analysis
    :type: List of booleans (true means include)
    :return: List of distances (doubles), one per selected sample, in sample order
    """
    # Resolve the whole selection first so a flag/sample length mismatch surfaces before any measuring
    lsChosen = [sName for iPosition, sName in enumerate(lsSamples) if lfSelected[iPosition]]

    def _distanceToAverage(sName):
        # Stack the sample's measurements on top of the average and measure between the two rows
        aSampleAndAverage = np.array([abndTable.funcGetSample(sName), ldAverage])
        return Metric.funcGetBrayCurtisDissimilarity(aSampleAndAverage)[0]

    return [_distanceToAverage(sName) for sName in lsChosen]
427 | 428 (measured using brays-curtis dissimilarity) from the average. |
428 #Happy path tested (1 case) | 429 The distances are reduced by needing to be in the lsSamples and being a true in the lfSelected |
def funcMeasureDistanceFromLabelToAverageOtherLabel(self, abndTable, lfGroupOfInterest, lfGroupOther):
    """
    Get the distance of the samples of one label from the average sample of another label.
    Note: This assumes 2 classes.

    :param abndTable: Table of data to work out of.
    :type: AbundanceTable
    :param lfGroupOfInterest: Boolean indicator of the sample being in the group of interest.
    :type: List of booleans, true indicating an individual in the group of interest.
    :param lfGroupOther: Boolean indicator of the sample being in the other group.
    :type: List of booleans, true indicating an individual in the other group.
    :return List of tuples: (string sample name, double distance) for each sample in the
                            group of interest, in the order of the table's sample names.
    """
    # Get all sample names
    lsAllSamples = abndTable.funcGetSampleNames()

    # Get the average population of the other group
    lAverageOther = self.funcGetAveragePopulation(abndTable=abndTable, lfCompress=lfGroupOther)

    # Get the distance of the group of interest from the average of the other group
    ldSelectedDistances = self.funcGetDistanceFromAverage(abndTable=abndTable, ldAverage=lAverageOther,
                                                          lsSamples=lsAllSamples, lfSelected=lfGroupOfInterest)

    lsSamplesOfInterest = [lsAllSamples[iindex] for iindex, fGroup in enumerate(lfGroupOfInterest) if fGroup]

    # Materialize the pairs: a bare zip is a one-shot iterator on Python 3, while the
    # documented return is a list that can be indexed and iterated more than once
    return list(zip(lsSamplesOfInterest, ldSelectedDistances))
453 | 454 """ |
454 #Happy path tested (1 test case) | 455 Get the distance of samples from one label from the average sample of not the label. |
def funcPerformDistanceSelection(self, abndTable, iSelectionCount, sLabel, sValueOfInterest):
    """
    Given metadata, samples of one value (sValueOfInterest) are measured from the average (centroid) of each other label group.
    The distances to the other groups are averaged per sample, and iSelectionCount samples are selected
    from the group of interest closest to and furthest from those centroids.

    :params abndTable: Abundance of measurements
    :type: AbundanceTable
    :params iSelectionCount: The number of samples selected per selection (closest and farthest).
                             A value less than 1 selects nothing.
    :type: Integer
    :params sLabel: ID of the metadata which is the supervised label
    :type: String
    :params sValueOfInterest: Metadata value in the sLabel metadata row of the abundance table which defines the group of interest.
    :type: String found in the abundance table metadata row indicated by sLabel.
    :return list: Lists of tuples (samplename, distance)
                  [[iSelectionCount of tuples closest to the other centroids],
                   [iSelectionCount of tuples farthest from the other centroids],
                   [all tuples of samples not selected]]
                  The first two lists can share samples when the group of interest is small.
    """

    lsMetadata = abndTable.funcGetMetadata(sLabel)

    # Other metadata values.
    # The value of interest is removed as one element; set(sValueOfInterest) would split
    # the string into characters and leave a multi-character value in the other groups.
    lsUniqueOtherValues = list(set(lsMetadata) - set([sValueOfInterest]))

    # Get boolean indicator of values of interest
    lfLabelsInterested = [sValueOfInterest == sValue for sValue in lsMetadata]

    # Sum the distances of the samples of interest from each of the other groups
    dictDistanceAverages = {}
    for sOtherLabel in lsUniqueOtherValues:
        # Get boolean indicator of labels not of interest
        lfLabelsOther = [sOtherLabel == sValue for sValue in lsMetadata]

        # Get the distances of the group of interest to the average of the other group
        ldValueDistances = dict(self.funcMeasureDistanceFromLabelToAverageOtherLabel(abndTable, lfLabelsInterested, lfLabelsOther))

        for sKey, dDistance in ldValueDistances.items():
            if sKey in dictDistanceAverages:
                dictDistanceAverages[sKey] = dDistance + dictDistanceAverages[sKey]
            else:
                dictDistanceAverages[sKey] = dDistance

    # Finish the average by dividing by the number of other groups
    dOtherGroupCount = float(len(lsUniqueOtherValues))
    ltpleAverageDistances = [(sKey, dSum/dOtherGroupCount) for sKey, dSum in dictDistanceAverages.items()]

    # Sort to extract extremes
    ltpleAverageDistances = sorted(ltpleAverageDistances, key=operator.itemgetter(1))

    # Get the closest and farthest distances.
    # A count of 0 needs the guard: the slice [-0:] would return every sample.
    if iSelectionCount < 1:
        ltupleDiscriminantSamples = []
        ltupleDistinctSamples = []
    else:
        ltupleDiscriminantSamples = ltpleAverageDistances[:iSelectionCount]
        ltupleDistinctSamples = ltpleAverageDistances[-iSelectionCount:]

    # Remove the selected samples from the larger population of distances (better visualization)
    setSelected = set(tpleSelected[0] for tpleSelected in ltupleDiscriminantSamples + ltupleDistinctSamples)

    # Return discriminant tuples, distinct tuples, other tuples
    return [ltupleDiscriminantSamples, ltupleDistinctSamples,
            [tplData for tplData in ltpleAverageDistances if tplData[0] not in setSelected]]
506 | 507 # Get boolean indicator of labels not of interest |
507 #Run the supervised method surrounding distance from centroids | 508 lfLabelsOther = [sOtherLabel == sValue for sValue in lsMetadata] |
508 #Happy path tested (3 test cases) | 509 |
def funcRunSupervisedDistancesFromCentroids(self, abundanceTable, fRunDistinct, fRunDiscriminant,
                                            xOutputSupFile, xPredictSupFile, strSupervisedMetadata,
                                            iSampleSupSelectionCount, lsOriginalSampleNames, lsOriginalLabels, fAppendFiles=False):
    """
    Runs the supervised selection that measures how far the samples of one label are from the centroids of the other labels.
    Every distinct value of the supervised metadata is used once as the group of interest.

    :param abundanceTable: Data to analyze
    :type: AbundanceTable
    :param fRunDistinct: Run distinct selection method
    :type: Boolean (true runs method)
    :param fRunDiscriminant: Run discriminant method
    :type: Boolean (true runs method)
    :param xOutputSupFile: File output from supervised methods detailing data going into the method. Skipped when false.
    :type: String or FileStream
    :param xPredictSupFile: File output from supervised methods holding the distance results. Skipped when false.
    :type: String or FileStream
    :param strSupervisedMetadata: The metadata that will be used to group samples.
    :type: String
    :param iSampleSupSelectionCount: Number of samples to select
    :type: Integer
    :param lsOriginalSampleNames: List of the sample names, order is important and should be preserved from the abundanceTable.
    :type: List of samples
    :param lsOriginalLabels: Passed through to the SVM file helpers.
    :type: List
    :param fAppendFiles: Indicates that output files already exist and appending is occurring.
    :type: Boolean
    :return Selected Samples: A dictionary of selected samples by selection ID
                              Dictionary {"Selection Method":["SampleID","SampleID"...]}
    """
    # Run each label value against all the others
    dictlltpleDistanceMeasurements = {}
    for sMetadataValue in set(abundanceTable.funcGetMetadata(strSupervisedMetadata)):
        lltpleSelection = self.funcPerformDistanceSelection(abndTable=abundanceTable,
            iSelectionCount=iSampleSupSelectionCount, sLabel=strSupervisedMetadata, sValueOfInterest=sMetadataValue)
        dictlltpleDistanceMeasurements.setdefault(sMetadataValue, []).extend(lltpleSelection)

    # Write the file that resembles an SVM input file (MicropitaVis needs it)
    if xOutputSupFile:
        if fAppendFiles:
            SVM.funcUpdateSVMFileWithAbundanceTable(abndAbundanceTable=abundanceTable, xOutputSVMFile=xOutputSupFile,
                lsOriginalLabels=lsOriginalLabels, lsSampleOrdering=lsOriginalSampleNames)
        else:
            SVM.funcConvertAbundanceTableToSVMFile(abndAbundanceTable=abundanceTable, xOutputSVMFile=xOutputSupFile,
                sMetadataLabel=strSupervisedMetadata, lsOriginalLabels=lsOriginalLabels, lsSampleOrdering=lsOriginalSampleNames)

    # Collect the selected sample names.
    # Both methods may be active, so each one extends its own entry across all label values.
    # Per label value, index 0 holds the closest (discriminant) tuples and index 1 the farthest (distinct) tuples.
    dictSelectedSamplesRet = {}
    for lltplDistances in dictlltpleDistanceMeasurements.values():
        if fRunDistinct:
            lsDistinct = dictSelectedSamplesRet.setdefault(ConstantsMicropita.c_strDistinct, [])
            lsDistinct.extend([ltple[0] for ltple in lltplDistances[1]])
        if fRunDiscriminant:
            lsDiscriminant = dictSelectedSamplesRet.setdefault(ConstantsMicropita.c_strDiscriminant, [])
            lsDiscriminant.extend([ltple[0] for ltple in lltplDistances[0]])

    # Write the file that resembles the probabilistic output of an SVM
    if xPredictSupFile:
        # Flatten the three lists per label value into one list of (sample, distance) tuples
        dictFlattenedDistances = {}
        for sKey, lltple in dictlltpleDistanceMeasurements.items():
            for ltple in lltple:
                for tple in ltple:
                    dictFlattenedDistances.setdefault(sKey, []).append(tple)

        if fAppendFiles:
            self._updatePredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xOutputSupFile,
                dictltpleDistanceMeasurements=dictFlattenedDistances, abundanceTable=abundanceTable, lsOriginalSampleNames=lsOriginalSampleNames)
        else:
            self._writeToPredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xOutputSupFile,
                dictltpleDistanceMeasurements=dictFlattenedDistances, abundanceTable=abundanceTable, lsOriginalSampleNames=lsOriginalSampleNames)

    return dictSelectedSamplesRet
579 | 580 # MicropitaVis needs some of these files |
580 #Two happy path test cases | 581 if xOutputSupFile: |
def _updatePredictFile(self, xPredictSupFile, xInputLabelsFile, dictltpleDistanceMeasurements, abundanceTable, lsOriginalSampleNames):
    """
    Manages updating the predict file.
    The existing file is read, its distances are merged with the new measurements
    (new measurements win for samples present in both), and the result is handed to _writeToPredictFile.

    :param xPredictSupFile: File that has predictions (distances) from the supervised method.
                            A stream is closed and reopened by its name.
    :type: FileStream or String file path
    :param xInputLabelsFile: File that was input to the supervised methods.
    :type: FileStream or String file path
    :param dictltpleDistanceMeasurements: New distance measurements by label group
    :type: Dictionary of lists of tuples {"labelgroup":[("SampleName",dDistance)...], "labelgroup":[("SampleName",dDistance)...]}
    :param abundanceTable: Passed through to _writeToPredictFile
    :type: AbundanceTable
    :param lsOriginalSampleNames: Sample names; row i of the existing file is attributed to entry i of this list
    :type: List of strings
    """

    if not isinstance(xPredictSupFile, str):
        xPredictSupFile.close()
        xPredictSupFile = xPredictSupFile.name

    # Read the existing file fully and close it before the writer is called with the same path
    with open(xPredictSupFile, 'r') as csvr:
        f = csv.reader(csvr, delimiter=ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace)

        # The builtin next() works on Python 2.6+ and Python 3; the reader's .next() method exists only on Python 2
        lsHeader = next(f)[1:]
        dictlltpleRead = dict([(sHeader, []) for sHeader in lsHeader])

        # Read data in; the first column of each row is skipped and values read here stay strings
        for iSampleIndex, sRow in enumerate(f):
            for iDistanceIndex, dDistance in enumerate(sRow[1:]):
                if not dDistance == ConstantsMicropita.c_sEmptyPredictFileValue:
                    dictlltpleRead[lsHeader[iDistanceIndex]].append((lsOriginalSampleNames[iSampleIndex], dDistance))

    # Combine dictltpleDistanceMeasurements with the data read from the file
    # If they share a key then merge, keeping the parameter data for samples found in both
    # If they do not share the key, keep the full data
    dictNew = {}
    for sKey, ltpleNew in dictltpleDistanceMeasurements.items():
        if sKey in dictlltpleRead:
            setNewSamples = set(tple[0] for tple in ltpleNew)
            dictNew[sKey] = ltpleNew + [tple for tple in dictlltpleRead[sKey] if tple[0] not in setNewSamples]
        else:
            dictNew[sKey] = ltpleNew
    for sKey in dictlltpleRead:
        if sKey not in dictltpleDistanceMeasurements:
            dictNew[sKey] = dictlltpleRead[sKey]

    # Call writer
    self._writeToPredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xInputLabelsFile,
                             dictltpleDistanceMeasurements=dictNew, abundanceTable=abundanceTable,
                             lsOriginalSampleNames=lsOriginalSampleNames, fFromUpdate=True)
625 | 626 if not isinstance(xPredictSupFile, str): |
626 #2 happy path test cases | 627 xPredictSupFile.close() |
627 def _writeToPredictFile(self, xPredictSupFile, xInputLabelsFile, dictltpleDistanceMeasurements, abundanceTable, lsOriginalSampleNames, fFromUpdate=False): | 628 xPredictSupFile = xPredictSupFile.name |
628 """ | 629 csvr = open(xPredictSupFile, 'r') |
629 Write to the predict file. | 630 |
630 | 631 f = csv.reader( |
631 :param xPredictSupFile: File that has predictions (distances) from the supervised method. | 632 csvr, delimiter=ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace) |
632 :type: FileStream or String file path | 633 lsHeader = f.next()[1:] |
633 :param xInputLabelsFile: File that as input to the supervised methods. | 634 dictlltpleRead = dict([(sHeader, []) for sHeader in lsHeader]) |
634 :type: FileStream or String file path | 635 |
635 :param dictltpleDistanceMeasurements: | 636 # Read data in |
636 :type: Dictionary of lists of tuples {"labelgroup":[("SampleName",dDistance)...], "labelgroup":[("SampleName",dDistance)...]} | 637 iSampleIndex = 0 |
637 :param abundanceTable: An abundance table of the sample data. | 638 for sRow in f: |
638 :type: AbundanceTable | 639 sLabel = sRow[0] |
639 :param lsOriginalSampleNames: Used if the file is being updated as the sample names so that it may be passed in and consistent with other writing. | 640 [dictlltpleRead[lsHeader[iDistanceIndex]].append((lsOriginalSampleNames[iSampleIndex], dDistance)) for iDistanceIndex, dDistance in enumerate(sRow[1:]) |
640 Otherwise will use the sample names from the abundance table. | 641 if not dDistance == ConstantsMicropita.c_sEmptyPredictFileValue] |
641 :type: List of strings | 642 iSampleIndex += 1 |
642 :param fFromUpdate: Indicates if this is part of an update to the file or not. | 643 |
643 :type: Boolean | 644 # Combine dictltpleDistanceMeasurements with new data |
644 """ | 645 # If they share a key then merge keeping parameter data |
645 | 646 # If they do not share the key, keep the full data |
646 xInputLabelsFileName = xInputLabelsFile | 647 dictNew = {} |
647 if not isinstance(xInputLabelsFile,str): | 648 for sKey in dictltpleDistanceMeasurements.keys(): |
648 xInputLabelsFileName = xInputLabelsFile.name | 649 lsSamples = [tple[0] |
649 f = csv.writer(open(xPredictSupFile,"w") if isinstance(xPredictSupFile, str) else xPredictSupFile,delimiter=ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace) | 650 for tple in dictltpleDistanceMeasurements[sKey]] |
650 | 651 dictNew[sKey] = dictltpleDistanceMeasurements[sKey]+[tple for tple in dictlltpleRead[sKey] if tple[0] |
651 lsAllSampleNames = abundanceTable.funcGetSampleNames() | 652 not in lsSamples] if sKey in dictlltpleRead.keys() else dictltpleDistanceMeasurements[sKey] |
652 lsLabels = SVM.funcReadLabelsFromFile(xSVMFile=xInputLabelsFileName, lsAllSampleNames= lsOriginalSampleNames if fFromUpdate else lsAllSampleNames, | 653 for sKey in dictlltpleRead: |
653 isPredictFile=False) | 654 if sKey not in dictltpleDistanceMeasurements.keys(): |
654 dictLabels = dict([(sSample,sLabel) for sLabel in lsLabels.keys() for sSample in lsLabels[sLabel]]) | 655 dictNew[sKey] = dictlltpleRead[sKey] |
655 | 656 |
656 #Dictionay keys will be used to order the predict file | 657 # Call writer |
657 lsMeasurementKeys = dictltpleDistanceMeasurements.keys() | 658 self._writeToPredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xInputLabelsFile, |
658 #Make header | 659 dictltpleDistanceMeasurements=dictNew, abundanceTable=abundanceTable, |
659 f.writerow(["labels"]+lsMeasurementKeys) | 660 lsOriginalSampleNames=lsOriginalSampleNames, fFromUpdate=True) |
660 | 661 |
661 #Reformat dictionary to make it easier to use | 662 # 2 happy path test cases |
662 for sKey in dictltpleDistanceMeasurements: | 663 def _writeToPredictFile(self, xPredictSupFile, xInputLabelsFile, dictltpleDistanceMeasurements, abundanceTable, lsOriginalSampleNames, fFromUpdate=False): |
663 dictltpleDistanceMeasurements[sKey] = dict([ltpl for ltpl in dictltpleDistanceMeasurements[sKey]]) | 664 """ |
664 | 665 Write to the predict file. |
665 for sSample in lsOriginalSampleNames: | 666 |
666 #Make body of file | 667 :param xPredictSupFile: File that has predictions (distances) from the supervised method. |
667 f.writerow([dictLabels.get(sSample,ConstantsMicropita.c_sEmptyPredictFileValue)]+ | 668 :type: FileStream or String file path |
668 [str(dictltpleDistanceMeasurements[sKey].get(sSample,ConstantsMicropita.c_sEmptyPredictFileValue)) | 669 :param xInputLabelsFile: File that as input to the supervised methods. |
669 for sKey in lsMeasurementKeys]) | 670 :type: FileStream or String file path |
670 | 671 :param dictltpleDistanceMeasurements: |
671 def _funcRunNormalizeSensitiveMethods(self, abndData, iSampleSelectionCount, dictSelectedSamples, lsAlphaMetrics, lsBetaMetrics, lsInverseBetaMetrics, | 672 :type: Dictionary of lists of tuples {"labelgroup":[("SampleName",dDistance)...], "labelgroup":[("SampleName",dDistance)...]} |
672 fRunDiversity, fRunRepresentative, fRunExtreme, strAlphaMetadata=None, | 673 :param abundanceTable: An abundance table of the sample data. |
673 istmBetaMatrix=None, istrmTree=None, istrmEnvr=None, fInvertDiversity=False): | 674 :type: AbundanceTable |
674 """ | 675 :param lsOriginalSampleNames: Used if the file is being updated as the sample names so that it may be passed in and consistent with other writing. |
675 Manages running methods that are sensitive to normalization. This is called twice, once for the set of methods which should not be normalized and the other | 676 Otherwise will use the sample names from the abundance table. |
676 for the set that should be normalized. | 677 :type: List of strings |
677 | 678 :param fFromUpdate: Indicates if this is part of an update to the file or not. |
678 :param abndData: Abundance table object holding the samples to be measured. | 679 :type: Boolean |
679 :type: AbundanceTable | 680 """ |
680 :param iSampleSelectionCount The number of samples to select per method. | 681 |
681 :type: Integer | 682 xInputLabelsFileName = xInputLabelsFile |
682 :param dictSelectedSamples Will be added to as samples are selected {"Method:["strSelectedSampleID","strSelectedSampleID"...]}. | 683 if not isinstance(xInputLabelsFile, str): |
683 :type: Dictionary | 684 xInputLabelsFileName = xInputLabelsFile.name |
684 :param lsAlphaMetrics: List of alpha metrics to use on alpha metric dependent assays (like highest diversity). | 685 f = csv.writer(open(xPredictSupFile, "w") if isinstance(xPredictSupFile, str) |
685 :type: List of strings | 686 else xPredictSupFile, delimiter=ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace) |
686 :param lsBetaMetrics: List of beta metrics to use on beta metric dependent assays (like most representative). | 687 |
687 :type: List of strings | 688 lsAllSampleNames = abundanceTable.funcGetSampleNames() |
688 :param lsInverseBetaMetrics: List of inverse beta metrics to use on inverse beta metric dependent assays (like most dissimilar). | 689 lsLabels = SVM.funcReadLabelsFromFile(xSVMFile=xInputLabelsFileName, lsAllSampleNames=lsOriginalSampleNames if fFromUpdate else lsAllSampleNames, |
689 :type: List of strings | 690 isPredictFile=False) |
690 :param fRunDiversity: Run Diversity based methods (true indicates run). | 691 dictLabels = dict([(sSample, sLabel) for sLabel in lsLabels.keys() |
691 :type: Boolean | 692 for sSample in lsLabels[sLabel]]) |
692 :param fRunRepresentative: Run Representative based methods (true indicates run). | 693 |
693 :type: Boolean | 694 # Dictionay keys will be used to order the predict file |
694 :param fRunExtreme: Run Extreme based methods (true indicates run). | 695 lsMeasurementKeys = dictltpleDistanceMeasurements.keys() |
695 :type: Boolean | 696 # Make header |
696 :param istmBetaMatrix: File that has a precalculated beta matrix | 697 f.writerow(["labels"]+lsMeasurementKeys) |
697 :type: File stream or File path string | 698 |
698 :return Selected Samples: Samples selected by methods. | 699 # Reformat dictionary to make it easier to use |
699 Dictionary {"Selection Method":["SampleID","SampleID","SampleID",...]} | 700 for sKey in dictltpleDistanceMeasurements: |
700 """ | 701 dictltpleDistanceMeasurements[sKey] = dict( |
701 | 702 [ltpl for ltpl in dictltpleDistanceMeasurements[sKey]]) |
702 #Sample ids/names | 703 |
703 lsSampleNames = abndData.funcGetSampleNames() | 704 for sSample in lsOriginalSampleNames: |
704 | 705 # Make body of file |
705 #Generate alpha metrics and get most diverse | 706 f.writerow([dictLabels.get(sSample, ConstantsMicropita.c_sEmptyPredictFileValue)] + |
706 if fRunDiversity: | 707 [str(dictltpleDistanceMeasurements[sKey].get(sSample, ConstantsMicropita.c_sEmptyPredictFileValue)) |
707 | 708 for sKey in lsMeasurementKeys]) |
708 #Get Alpha metrics matrix | 709 |
709 internalAlphaMatrix = None | 710 def _funcRunNormalizeSensitiveMethods(self, abndData, iSampleSelectionCount, dictSelectedSamples, lsAlphaMetrics, lsBetaMetrics, lsInverseBetaMetrics, |
710 #Name of technique | 711 fRunDiversity, fRunRepresentative, fRunExtreme, strAlphaMetadata=None, |
711 strMethod = [strAlphaMetadata] if strAlphaMetadata else lsAlphaMetrics | 712 istmBetaMatrix=None, istrmTree=None, istrmEnvr=None, fInvertDiversity=False): |
712 | 713 """ |
713 #If given an alpha-diversity metadata | 714 Manages running methods that are sensitive to normalization. This is called twice, once for the set of methods which should not be normalized and the other |
714 if strAlphaMetadata: | 715 for the set that should be normalized. |
715 internalAlphaMatrix = [[float(strNum) for strNum in abndData.funcGetMetadata(strAlphaMetadata)]] | 716 |
716 else: | 717 :param abndData: Abundance table object holding the samples to be measured. |
717 #Expects Observations (Taxa (row) x sample (column)) | 718 :type: AbundanceTable |
718 #Returns [[metric1-sample1, metric1-sample2, metric1-sample3],[metric1-sample1, metric1-sample2, metric1-sample3]] | 719 :param iSampleSelectionCount The number of samples to select per method. |
719 internalAlphaMatrix = Metric.funcBuildAlphaMetricsMatrix(npaSampleAbundance = abndData.funcGetAbundanceCopy() | 720 :type: Integer |
720 if not abndData.funcIsSummed() | 721 :param dictSelectedSamples Will be added to as samples are selected {"Method:["strSelectedSampleID","strSelectedSampleID"...]}. |
721 else abndData.funcGetFeatureAbundanceTable(abndData.funcGetTerminalNodes()).funcGetAbundanceCopy(), | 722 :type: Dictionary |
722 lsSampleNames = lsSampleNames, lsDiversityMetricAlpha = lsAlphaMetrics) | 723 :param lsAlphaMetrics: List of alpha metrics to use on alpha metric dependent assays (like highest diversity). |
723 | 724 :type: List of strings |
724 if internalAlphaMatrix: | 725 :param lsBetaMetrics: List of beta metrics to use on beta metric dependent assays (like most representative). |
725 #Invert measurments | 726 :type: List of strings |
726 if fInvertDiversity: | 727 :param lsInverseBetaMetrics: List of inverse beta metrics to use on inverse beta metric dependent assays (like most dissimilar). |
727 lldNewDiversity = [] | 728 :type: List of strings |
728 for lsLine in internalAlphaMatrix: | 729 :param fRunDiversity: Run Diversity based methods (true indicates run). |
729 lldNewDiversity.append([1/max(dValue,ConstantsMicropita.c_smallNumber) for dValue in lsLine]) | 730 :type: Boolean |
730 internalAlphaMatrix = lldNewDiversity | 731 :param fRunRepresentative: Run Representative based methods (true indicates run). |
731 #Get top ranked alpha diversity by most diverse | 732 :type: Boolean |
732 #Expects [[sample1,sample2,sample3...],[sample1,sample2,sample3..],...] | 733 :param fRunExtreme: Run Extreme based methods (true indicates run). |
733 #Returns [[sampleName1, sampleName2, sampleNameN],[sampleName1, sampleName2, sampleNameN]] | 734 :type: Boolean |
734 mostDiverseAlphaSamplesIndexes = self.funcGetTopRankedSamples(lldMatrix=internalAlphaMatrix, lsSampleNames=lsSampleNames, iTopAmount=iSampleSelectionCount) | 735 :param istmBetaMatrix: File that has a precalculated beta matrix |
735 | 736 :type: File stream or File path string |
736 #Add to results | 737 :return Selected Samples: Samples selected by methods. |
737 for index in xrange(0,len(strMethod)): | 738 Dictionary {"Selection Method":["SampleID","SampleID","SampleID",...]} |
738 strSelectionMethod = self.dictConvertAMetricDiversity.get(strMethod[index],ConstantsMicropita.c_strDiversity+"="+strMethod[index]) | 739 """ |
739 dictSelectedSamples.setdefault(strSelectionMethod,[]).extend(mostDiverseAlphaSamplesIndexes[index]) | 740 |
740 | 741 # Sample ids/names |
741 logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Selected Samples 1b") | 742 lsSampleNames = abndData.funcGetSampleNames() |
742 logging.info(dictSelectedSamples) | 743 |
743 | 744 # Generate alpha metrics and get most diverse |
744 #Generate beta metrics and | 745 if fRunDiversity: |
745 if fRunRepresentative or fRunExtreme: | 746 |
746 | 747 # Get Alpha metrics matrix |
747 #Abundance matrix transposed | 748 internalAlphaMatrix = None |
748 npaTransposedAbundance = UtilityMath.funcTransposeDataMatrix(abndData.funcGetAbundanceCopy(), fRemoveAdornments=True) | 749 # Name of technique |
749 | 750 strMethod = [ |
750 #Get center selection using clusters/tiling | 751 strAlphaMetadata] if strAlphaMetadata else lsAlphaMetrics |
751 #This will be for beta metrics in normalized space | 752 |
752 if fRunRepresentative: | 753 # If given an alpha-diversity metadata |
753 | 754 if strAlphaMetadata: |
754 if istmBetaMatrix: | 755 internalAlphaMatrix = [ |
755 #Get representative dissimilarity samples | 756 [float(strNum) for strNum in abndData.funcGetMetadata(strAlphaMetadata)]] |
756 medoidSamples=self.funcGetCentralSamplesByKMedoids(npaMatrix=npaTransposedAbundance, sMetric=ConstantsMicropita.c_custom, lsSampleNames=lsSampleNames, iNumberSamplesReturned=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr) | 757 else: |
757 | 758 # Expects Observations (Taxa (row) x sample (column)) |
758 if medoidSamples: | 759 #Returns [[metric1-sample1, metric1-sample2, metric1-sample3],[metric1-sample1, metric1-sample2, metric1-sample3]] |
759 dictSelectedSamples.setdefault(ConstantsMicropita.c_strRepresentative+"="+ConstantsMicropita.c_custom,[]).extend(medoidSamples) | 760 internalAlphaMatrix = Metric.funcBuildAlphaMetricsMatrix(npaSampleAbundance=abndData.funcGetAbundanceCopy() |
760 else: | 761 if not abndData.funcIsSummed() |
761 logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Performing representative selection on normalized data.") | 762 else abndData.funcGetFeatureAbundanceTable(abndData.funcGetTerminalNodes()).funcGetAbundanceCopy(), |
762 for bMetric in lsBetaMetrics: | 763 lsSampleNames=lsSampleNames, lsDiversityMetricAlpha=lsAlphaMetrics) |
763 | 764 |
764 #Get representative dissimilarity samples | 765 if internalAlphaMatrix: |
765 medoidSamples=self.funcGetCentralSamplesByKMedoids(npaMatrix=npaTransposedAbundance, sMetric=bMetric, lsSampleNames=lsSampleNames, iNumberSamplesReturned=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr) | 766 # Invert measurments |
766 | 767 if fInvertDiversity: |
767 if medoidSamples: | 768 lldNewDiversity = [] |
768 dictSelectedSamples.setdefault(self.dictConvertBMetricToMethod.get(bMetric,ConstantsMicropita.c_strRepresentative+"="+bMetric),[]).extend(medoidSamples) | 769 for lsLine in internalAlphaMatrix: |
769 | 770 lldNewDiversity.append( |
770 #Get extreme selection using clusters, tiling | 771 [1/max(dValue, ConstantsMicropita.c_smallNumber) for dValue in lsLine]) |
771 if fRunExtreme: | 772 internalAlphaMatrix = lldNewDiversity |
772 logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Performing extreme selection on normalized data.") | 773 # Get top ranked alpha diversity by most diverse |
773 if istmBetaMatrix: | 774 # Expects [[sample1,sample2,sample3...],[sample1,sample2,sample3..],...] |
774 | 775 #Returns [[sampleName1, sampleName2, sampleNameN],[sampleName1, sampleName2, sampleNameN]] |
775 #Samples for representative dissimilarity | 776 mostDiverseAlphaSamplesIndexes = self.funcGetTopRankedSamples( |
776 #This involves inverting the distance metric, | 777 lldMatrix=internalAlphaMatrix, lsSampleNames=lsSampleNames, iTopAmount=iSampleSelectionCount) |
777 #Taking the dendrogram level of where the number cluster == the number of samples to select | 778 |
778 #Returning a repersentative sample from each cluster | 779 # Add to results |
779 extremeSamples = self.funcSelectExtremeSamplesFromHClust(strBetaMetric=ConstantsMicropita.c_custom, npaAbundanceMatrix=npaTransposedAbundance, lsSampleNames=lsSampleNames, iSelectSampleCount=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr) | 780 for index in xrange(0, len(strMethod)): |
780 | 781 strSelectionMethod = self.dictConvertAMetricDiversity.get( |
781 #Add selected samples | 782 strMethod[index], ConstantsMicropita.c_strDiversity+"="+strMethod[index]) |
782 if extremeSamples: | 783 dictSelectedSamples.setdefault(strSelectionMethod, []).extend( |
783 dictSelectedSamples.setdefault(ConstantsMicropita.c_strExtreme+"="+ConstantsMicropita.c_custom,[]).extend(extremeSamples) | 784 mostDiverseAlphaSamplesIndexes[index]) |
784 | 785 |
785 else: | 786 logging.info( |
786 #Run KMedoids with inverse custom distance metric in normalized space | 787 "MicroPITA.funcRunNormalizeSensitiveMethods:: Selected Samples 1b") |
787 for bMetric in lsInverseBetaMetrics: | 788 logging.info(dictSelectedSamples) |
788 | 789 |
789 #Samples for representative dissimilarity | 790 # Generate beta metrics and |
790 #This involves inverting the distance metric, | 791 if fRunRepresentative or fRunExtreme: |
791 #Taking the dendrogram level of where the number cluster == the number of samples to select | 792 |
792 #Returning a repersentative sample from each cluster | 793 # Abundance matrix transposed |
793 extremeSamples = self.funcSelectExtremeSamplesFromHClust(strBetaMetric=bMetric, npaAbundanceMatrix=npaTransposedAbundance, lsSampleNames=lsSampleNames, iSelectSampleCount=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr) | 794 npaTransposedAbundance = UtilityMath.funcTransposeDataMatrix( |
794 | 795 abndData.funcGetAbundanceCopy(), fRemoveAdornments=True) |
795 #Add selected samples | 796 |
796 if extremeSamples: | 797 # Get center selection using clusters/tiling |
797 dictSelectedSamples.setdefault(self.dictConvertInvBMetricToMethod.get(bMetric,ConstantsMicropita.c_strExtreme+"="+bMetric),[]).extend(extremeSamples) | 798 # This will be for beta metrics in normalized space |
798 | 799 if fRunRepresentative: |
799 logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Selected Samples 2,3b") | 800 |
800 logging.info(dictSelectedSamples) | 801 if istmBetaMatrix: |
801 return dictSelectedSamples | 802 # Get representative dissimilarity samples |
802 | 803 medoidSamples = self.funcGetCentralSamplesByKMedoids(npaMatrix=npaTransposedAbundance, sMetric=ConstantsMicropita.c_custom, lsSampleNames=lsSampleNames, |
803 def funcRun(self, strIDName, strLastMetadataName, istmInput, | 804 iNumberSamplesReturned=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr) |
804 ostmInputPredictFile, ostmPredictFile, ostmCheckedFile, ostmOutput, | 805 |
805 cDelimiter, cFeatureNameDelimiter, strFeatureSelection, | 806 if medoidSamples: |
806 istmFeatures, iCount, lstrMethods, strLastRowMetadata = None, strLabel = None, strStratify = None, | 807 dictSelectedSamples.setdefault( |
807 strCustomAlpha = None, strCustomBeta = None, strAlphaMetadata = None, istmBetaMatrix = None, istrmTree = None, istrmEnvr = None, | 808 ConstantsMicropita.c_strRepresentative+"="+ConstantsMicropita.c_custom, []).extend(medoidSamples) |
808 iMinSeqs = ConstantsMicropita.c_liOccurenceFilter[0], iMinSamples = ConstantsMicropita.c_liOccurenceFilter[1], fInvertDiversity = False): | 809 else: |
809 """ | 810 logging.info( |
810 Manages the selection of samples given different metrics. | 811 "MicroPITA.funcRunNormalizeSensitiveMethods:: Performing representative selection on normalized data.") |
811 | 812 for bMetric in lsBetaMetrics: |
812 :param strIDName: Sample Id metadata row | 813 |
813 :type: String | 814 # Get representative dissimilarity samples |
814 :param strLastMetadataName: The id of the metadata positioned last in the abundance table. | 815 medoidSamples = self.funcGetCentralSamplesByKMedoids(npaMatrix=npaTransposedAbundance, sMetric=bMetric, lsSampleNames=lsSampleNames, |
815 :type: String String metadata id. | 816 iNumberSamplesReturned=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr) |
816 :param istmInput: File to store input data to supervised methods. | 817 |
817 :type: FileStream of String file path | 818 if medoidSamples: |
818 :param ostmInputPredictFile: File to store distances from supervised methods. | 819 dictSelectedSamples.setdefault(self.dictConvertBMetricToMethod.get( |
819 :type: FileStream or String file path | 820 bMetric, ConstantsMicropita.c_strRepresentative+"="+bMetric), []).extend(medoidSamples) |
820 :param ostmCheckedFile: File to store the AbundanceTable data after it is being checked. | 821 |
821 :type: FileStream or String file path | 822 # Get extreme selection using clusters, tiling |
822 :param ostmOutPut: File to store sample selection by methods of interest. | 823 if fRunExtreme: |
823 :type: FileStream or String file path | 824 logging.info( |
824 :param cDelimiter: Delimiter of abundance table. | 825 "MicroPITA.funcRunNormalizeSensitiveMethods:: Performing extreme selection on normalized data.") |
825 :type: Character Char (default TAB). | 826 if istmBetaMatrix: |
826 :param cFeatureNameDelimiter: Delimiter of the name of features (for instance if they contain consensus lineages indicating clades). | 827 |
827 :type: Character (default |). | 828 # Samples for representative dissimilarity |
828 :param stFeatureSelectionMethod: Which method to use to select features in a targeted manner (Using average ranked abundance or average abundance). | 829 # This involves inverting the distance metric, |
829 :type: String (specific values indicated in ConstantsMicropita.lsTargetedFeatureMethodValues). | 830 # Taking the dendrogram level of where the number cluster == the number of samples to select |
830 :param istmFeatures: File which holds the features of interest if using targeted feature methodology. | 831 # Returning a repersentative sample from each cluster |
831 :type: FileStream or String file path | 832 extremeSamples = self.funcSelectExtremeSamplesFromHClust(strBetaMetric=ConstantsMicropita.c_custom, npaAbundanceMatrix=npaTransposedAbundance, |
832 :param iCount: Number of samples to select in each methods, supervised methods select this amount per label if possible. | 833 lsSampleNames=lsSampleNames, iSelectSampleCount=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr) |
833 :type: Integer integer. | 834 |
834 :param lstrMethods: List of strings indicating selection techniques. | 835 # Add selected samples |
835 :type: List of string method names | 836 if extremeSamples: |
836 :param strLabel: The metadata used for supervised labels. | 837 dictSelectedSamples.setdefault( |
837 :type: String | 838 ConstantsMicropita.c_strExtreme+"="+ConstantsMicropita.c_custom, []).extend(extremeSamples) |
838 :param strStratify: The metadata used to stratify unsupervised data. | 839 |
839 :type: String | 840 else: |
840 :param strCustomAlpha: Custom alpha diversity metric | 841 # Run KMedoids with inverse custom distance metric in normalized space |
841 :type: String | 842 for bMetric in lsInverseBetaMetrics: |
842 :param strCustomBeta: Custom beta diversity metric | 843 |
843 :type: String | 844 # Samples for representative dissimilarity |
844 :param strAlphaMetadata: Metadata id which is a diveristy metric to use in highest diversity sampling | 845 # This involves inverting the distance metric, |
845 :type: String | 846 # Taking the dendrogram level of where the number cluster == the number of samples to select |
846 :param istmBetaMatrix: File containing precalculated beta-diversity matrix for representative sampling | 847 # Returning a repersentative sample from each cluster |
847 :type: FileStream or String file path | 848 extremeSamples = self.funcSelectExtremeSamplesFromHClust(strBetaMetric=bMetric, npaAbundanceMatrix=npaTransposedAbundance, lsSampleNames=lsSampleNames, |
848 :param istrmTree: File containing tree for phylogentic beta-diversity analysis | 849 iSelectSampleCount=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr) |
849 :type: FileStream or String file path | 850 |
850 :param istrmEnvr: File containing environment for phylogentic beta-diversity analysis | 851 # Add selected samples |
851 :type: FileStream or String file path | 852 if extremeSamples: |
852 :param iMinSeqs: Minimum sequence in the occurence filter which filters all features not with a minimum number of sequences in each of a minimum number of samples. | 853 dictSelectedSamples.setdefault(self.dictConvertInvBMetricToMethod.get( |
853 :type: Integer | 854 bMetric, ConstantsMicropita.c_strExtreme+"="+bMetric), []).extend(extremeSamples) |
854 :param iMinSamples: Minimum sample count for the occurence filter. | 855 |
855 :type: Integer | 856 logging.info( |
856 :param fInvertDiversity: When true will invert diversity measurements before using. | 857 "MicroPITA.funcRunNormalizeSensitiveMethods:: Selected Samples 2,3b") |
857 :type: boolean | 858 logging.info(dictSelectedSamples) |
858 :return Selected Samples: Samples selected by methods. | 859 return dictSelectedSamples |
859 Dictionary {"Selection Method":["SampleID","SampleID","SampleID",...]} | 860 |
860 """ | 861 def funcRun(self, strIDName, strLastMetadataName, istmInput, |
861 | 862 ostmInputPredictFile, ostmPredictFile, ostmCheckedFile, ostmOutput, |
862 #Holds the top ranked samples from different metrics | 863 cDelimiter, cFeatureNameDelimiter, strFeatureSelection, |
863 #dict[metric name] = [samplename,samplename...] | 864 istmFeatures, iCount, lstrMethods, strLastRowMetadata=None, strLabel=None, strStratify=None, |
864 selectedSamples = dict() | 865 strCustomAlpha=None, strCustomBeta=None, strAlphaMetadata=None, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None, |
865 | 866 iMinSeqs=ConstantsMicropita.c_liOccurenceFilter[0], iMinSamples=ConstantsMicropita.c_liOccurenceFilter[1], fInvertDiversity=False): |
866 #If a target feature file is given make sure that targeted feature is in the selection methods, if not add | 867 """ |
867 if ConstantsMicropita.c_strFeature in lstrMethods: | 868 Manages the selection of samples given different metrics. |
868 if not istmFeatures: | 869 |
869 logging.error("MicroPITA.funcRun:: Did not receive both the Targeted feature file and the feature selection method. MicroPITA did not run.") | 870 :param strIDName: Sample Id metadata row |
870 return False | 871 :type: String |
871 | 872 :param strLastMetadataName: The id of the metadata positioned last in the abundance table. |
872 #Diversity metrics to run | 873 :type: String String metadata id. |
873 #Use custom metrics if specified | 874 :param istmInput: File to store input data to supervised methods. |
874 #Custom beta metrics set to normalized only, custom alpha metrics set to count only | 875 :type: FileStream of String file path |
875 diversityMetricsAlpha = [] if strCustomAlpha or strAlphaMetadata else [MicroPITA.c_strInverseSimpsonDiversity] | 876 :param ostmInputPredictFile: File to store distances from supervised methods. |
876 diversityMetricsBeta = [] if istmBetaMatrix else [strCustomBeta] if strCustomBeta else [MicroPITA.c_strBrayCurtisDissimilarity] | 877 :type: FileStream or String file path |
878 :param ostmCheckedFile: File to store the AbundanceTable data after it is being checked. | |
879 :type: FileStream or String file path | |
880 :param ostmOutPut: File to store sample selection by methods of interest. | |
881 :type: FileStream or String file path | |
882 :param cDelimiter: Delimiter of abundance table. | |
883 :type: Character Char (default TAB). | |
884 :param cFeatureNameDelimiter: Delimiter of the name of features (for instance if they contain consensus lineages indicating clades). | |
885 :type: Character (default |). | |
886 :param stFeatureSelectionMethod: Which method to use to select features in a targeted manner (Using average ranked abundance or average abundance). | |
887 :type: String (specific values indicated in ConstantsMicropita.lsTargetedFeatureMethodValues). | |
888 :param istmFeatures: File which holds the features of interest if using targeted feature methodology. | |
889 :type: FileStream or String file path | |
890 :param iCount: Number of samples to select in each methods, supervised methods select this amount per label if possible. | |
891 :type: Integer integer. | |
892 :param lstrMethods: List of strings indicating selection techniques. | |
893 :type: List of string method names | |
894 :param strLabel: The metadata used for supervised labels. | |
895 :type: String | |
896 :param strStratify: The metadata used to stratify unsupervised data. | |
897 :type: String | |
898 :param strCustomAlpha: Custom alpha diversity metric | |
899 :type: String | |
900 :param strCustomBeta: Custom beta diversity metric | |
901 :type: String | |
902 :param strAlphaMetadata: Metadata id which is a diveristy metric to use in highest diversity sampling | |
903 :type: String | |
904 :param istmBetaMatrix: File containing precalculated beta-diversity matrix for representative sampling | |
905 :type: FileStream or String file path | |
906 :param istrmTree: File containing tree for phylogentic beta-diversity analysis | |
907 :type: FileStream or String file path | |
908 :param istrmEnvr: File containing environment for phylogentic beta-diversity analysis | |
909 :type: FileStream or String file path | |
910 :param iMinSeqs: Minimum sequence in the occurence filter which filters all features not with a minimum number of sequences in each of a minimum number of samples. | |
911 :type: Integer | |
912 :param iMinSamples: Minimum sample count for the occurence filter. | |
913 :type: Integer | |
914 :param fInvertDiversity: When true will invert diversity measurements before using. | |
915 :type: boolean | |
916 :return Selected Samples: Samples selected by methods. | |
917 Dictionary {"Selection Method":["SampleID","SampleID","SampleID",...]} | |
918 """ | |
919 | |
920 # Holds the top ranked samples from different metrics | |
921 # dict[metric name] = [samplename,samplename...] | |
922 selectedSamples = dict() | |
923 | |
924 # If targeted feature selection is requested, the targeted feature file must also be given; otherwise log an error and do not run | |
925 if ConstantsMicropita.c_strFeature in lstrMethods: | |
926 if not istmFeatures: | |
927 logging.error( | |
928 "MicroPITA.funcRun:: Did not receive both the Targeted feature file and the feature selection method. MicroPITA did not run.") | |
929 return False | |
930 | |
931 # Diversity metrics to run | |
932 # Use custom metrics if specified | |
933 # Custom beta metrics set to normalized only, custom alpha metrics set to count only | |
934 diversityMetricsAlpha = [] if strCustomAlpha or strAlphaMetadata else [ | |
935 MicroPITA.c_strInverseSimpsonDiversity] | |
936 diversityMetricsBeta = [] if istmBetaMatrix else [ | |
937 strCustomBeta] if strCustomBeta else [MicroPITA.c_strBrayCurtisDissimilarity] | |
877 # inverseDiversityMetricsBeta = [MicroPITA.c_strInvBrayCurtisDissimilarity] | 938 # inverseDiversityMetricsBeta = [MicroPITA.c_strInvBrayCurtisDissimilarity] |
878 diversityMetricsAlphaNoNormalize = [strAlphaMetadata] if strAlphaMetadata else [strCustomAlpha] if strCustomAlpha else [] | 939 diversityMetricsAlphaNoNormalize = [strAlphaMetadata] if strAlphaMetadata else [ |
879 diversityMetricsBetaNoNormalize = [] | 940 strCustomAlpha] if strCustomAlpha else [] |
941 diversityMetricsBetaNoNormalize = [] | |
880 # inverseDiversityMetricsBetaNoNormalize = [] | 942 # inverseDiversityMetricsBetaNoNormalize = [] |
881 | 943 |
882 #Targeted taxa | 944 # Targeted taxa |
883 userDefinedTaxa = [] | 945 userDefinedTaxa = [] |
884 | 946 |
885 #Perform different flows flags | 947 # Perform different flows flags |
886 c_RUN_MAX_DIVERSITY_1 = ConstantsMicropita.c_strDiversity in lstrMethods | 948 c_RUN_MAX_DIVERSITY_1 = ConstantsMicropita.c_strDiversity in lstrMethods |
887 c_RUN_REPRESENTIVE_DISSIMILARITY_2 = ConstantsMicropita.c_strRepresentative in lstrMethods | 949 c_RUN_REPRESENTIVE_DISSIMILARITY_2 = ConstantsMicropita.c_strRepresentative in lstrMethods |
888 c_RUN_MAX_DISSIMILARITY_3 = ConstantsMicropita.c_strExtreme in lstrMethods | 950 c_RUN_MAX_DISSIMILARITY_3 = ConstantsMicropita.c_strExtreme in lstrMethods |
889 c_RUN_RANK_AVERAGE_USER_4 = False | 951 c_RUN_RANK_AVERAGE_USER_4 = False |
890 if ConstantsMicropita.c_strFeature in lstrMethods: | 952 if ConstantsMicropita.c_strFeature in lstrMethods: |
891 c_RUN_RANK_AVERAGE_USER_4 = True | 953 c_RUN_RANK_AVERAGE_USER_4 = True |
892 if not istmFeatures: | 954 if not istmFeatures: |
893 logging.error("MicroPITA.funcRun:: No taxa file was given for taxa selection.") | 955 logging.error( |
894 return False | 956 "MicroPITA.funcRun:: No taxa file was given for taxa selection.") |
895 #Read in taxa list, break down to lines and filter out empty strings | 957 return False |
896 userDefinedTaxa = filter(None,(s.strip( ) for s in istmFeatures.readlines())) | 958 # Read in taxa list, break down to lines and filter out empty strings |
897 c_RUN_RANDOM_5 = ConstantsMicropita.c_strRandom in lstrMethods | 959 userDefinedTaxa = filter(None, (s.strip() |
898 c_RUN_DISTINCT = ConstantsMicropita.c_strDistinct in lstrMethods | 960 for s in istmFeatures.readlines())) |
899 c_RUN_DISCRIMINANT = ConstantsMicropita.c_strDiscriminant in lstrMethods | 961 c_RUN_RANDOM_5 = ConstantsMicropita.c_strRandom in lstrMethods |
900 | 962 c_RUN_DISTINCT = ConstantsMicropita.c_strDistinct in lstrMethods |
901 #Read in abundance data | 963 c_RUN_DISCRIMINANT = ConstantsMicropita.c_strDiscriminant in lstrMethods |
902 #Abundance is a structured array. Samples (column) by Taxa (rows) with the taxa id row included as the column index=0 | 964 |
903 #Abundance table object to read in and manage data | 965 # Read in abundance data |
904 totalAbundanceTable = AbundanceTable.funcMakeFromFile(xInputFile=istmInput, lOccurenceFilter = [iMinSeqs, iMinSamples], | 966 # Abundance is a structured array. Samples (column) by Taxa (rows) with the taxa id row included as the column index=0 |
905 cDelimiter=cDelimiter, sMetadataID=strIDName, sLastMetadataRow=strLastRowMetadata, | 967 # Abundance table object to read in and manage data |
906 sLastMetadata=strLastMetadataName, cFeatureNameDelimiter=cFeatureNameDelimiter, xOutputFile=ostmCheckedFile) | 968 totalAbundanceTable = AbundanceTable.funcMakeFromFile(xInputFile=istmInput, lOccurenceFilter=[iMinSeqs, iMinSamples], |
907 if not totalAbundanceTable: | 969 cDelimiter=cDelimiter, sMetadataID=strIDName, sLastMetadataRow=strLastRowMetadata, |
908 logging.error("MicroPITA.funcRun:: Could not read in the abundance table. Analysis was not performed."+ | 970 sLastMetadata=strLastMetadataName, cFeatureNameDelimiter=cFeatureNameDelimiter, xOutputFile=ostmCheckedFile) |
909 " This often occurs when the Last Metadata is not specified correctly."+ | 971 if not totalAbundanceTable: |
910 " Please check to make sure the Last Metadata selection is the row of the last metadata,"+ | 972 logging.error("MicroPITA.funcRun:: Could not read in the abundance table. Analysis was not performed." + |
911 " all values after this selection should be microbial measurements and should be numeric.") | 973 " This often occurs when the Last Metadata is not specified correctly." + |
912 return False | 974 " Please check to make sure the Last Metadata selection is the row of the last metadata," + |
913 | 975 " all values after this selection should be microbial measurements and should be numeric.") |
914 lsOriginalLabels = SVM.funcMakeLabels(totalAbundanceTable.funcGetMetadata(strLabel)) if strLabel else strLabel | 976 return False |
915 | 977 |
916 dictTotalMetadata = totalAbundanceTable.funcGetMetadataCopy() | 978 lsOriginalLabels = SVM.funcMakeLabels( |
917 logging.debug("MicroPITA.funcRun:: Received metadata=" + str(dictTotalMetadata)) | 979 totalAbundanceTable.funcGetMetadata(strLabel)) if strLabel else strLabel |
918 #If there is only 1 unique value for the labels, do not run the Supervised methods | 980 |
919 if strLabel and ( len(set(dictTotalMetadata.get(strLabel,[]))) < 2 ): | 981 dictTotalMetadata = totalAbundanceTable.funcGetMetadataCopy() |
920 logging.error("The label " + strLabel + " did not have 2 or more values. Labels found=" + str(dictTotalMetadata.get(strLabel,[]))) | 982 logging.debug("MicroPITA.funcRun:: Received metadata=" + |
921 return False | 983 str(dictTotalMetadata)) |
922 | 984 # If there is only 1 unique value for the labels, do not run the Supervised methods |
923 #Run unsupervised methods### | 985 if strLabel and (len(set(dictTotalMetadata.get(strLabel, []))) < 2): |
924 #Stratify the data if need be and drop the old data | 986 logging.error("The label " + strLabel + " did not have 2 or more values. Labels found=" + |
925 lStratifiedAbundanceTables = totalAbundanceTable.funcStratifyByMetadata(strStratify) if strStratify else [totalAbundanceTable] | 987 str(dictTotalMetadata.get(strLabel, []))) |
926 | 988 return False |
927 #For each stratified abundance block or for the unstratfified abundance | 989 |
928 #Run the unsupervised blocks | 990 #Run unsupervised methods### |
929 fAppendSupFiles = False | 991 # Stratify the data if need be and drop the old data |
930 for stratAbundanceTable in lStratifiedAbundanceTables: | 992 lStratifiedAbundanceTables = totalAbundanceTable.funcStratifyByMetadata( |
931 logging.info("MicroPITA.funcRun:: Running abundance block:"+stratAbundanceTable.funcGetName()) | 993 strStratify) if strStratify else [totalAbundanceTable] |
932 | 994 |
933 ###NOT SUMMED, NOT NORMALIZED | 995 # For each stratified abundance block or for the unstratfified abundance |
934 #Only perform if the data is not yet normalized | 996 # Run the unsupervised blocks |
935 if not stratAbundanceTable.funcIsNormalized( ): | 997 fAppendSupFiles = False |
936 #Need to first work with unnormalized data | 998 for stratAbundanceTable in lStratifiedAbundanceTables: |
937 if c_RUN_MAX_DIVERSITY_1 or c_RUN_REPRESENTIVE_DISSIMILARITY_2 or c_RUN_MAX_DISSIMILARITY_3: | 999 logging.info("MicroPITA.funcRun:: Running abundance block:" + |
938 | 1000 stratAbundanceTable.funcGetName()) |
939 self._funcRunNormalizeSensitiveMethods(abndData=stratAbundanceTable, iSampleSelectionCount=iCount, | 1001 |
940 dictSelectedSamples=selectedSamples, lsAlphaMetrics=diversityMetricsAlphaNoNormalize, | 1002 # NOT SUMMED, NOT NORMALIZED |
941 lsBetaMetrics=diversityMetricsBetaNoNormalize, | 1003 # Only perform if the data is not yet normalized |
942 lsInverseBetaMetrics=diversityMetricsBetaNoNormalize, | 1004 if not stratAbundanceTable.funcIsNormalized(): |
943 fRunDiversity=c_RUN_MAX_DIVERSITY_1,fRunRepresentative=c_RUN_REPRESENTIVE_DISSIMILARITY_2, | 1005 # Need to first work with unnormalized data |
944 fRunExtreme=c_RUN_MAX_DISSIMILARITY_3, strAlphaMetadata=strAlphaMetadata, | 1006 if c_RUN_MAX_DIVERSITY_1 or c_RUN_REPRESENTIVE_DISSIMILARITY_2 or c_RUN_MAX_DISSIMILARITY_3: |
945 istrmTree=istrmTree, istrmEnvr=istrmEnvr, fInvertDiversity=fInvertDiversity) | 1007 |
946 | 1008 self._funcRunNormalizeSensitiveMethods(abndData=stratAbundanceTable, iSampleSelectionCount=iCount, |
947 | 1009 dictSelectedSamples=selectedSamples, lsAlphaMetrics=diversityMetricsAlphaNoNormalize, |
948 #Generate selection by the rank average of user defined taxa | 1010 lsBetaMetrics=diversityMetricsBetaNoNormalize, |
949 #Expects (Taxa (row) by Samples (column)) | 1011 lsInverseBetaMetrics=diversityMetricsBetaNoNormalize, |
950 #Expects a column 0 of taxa id that is skipped | 1012 fRunDiversity=c_RUN_MAX_DIVERSITY_1, fRunRepresentative=c_RUN_REPRESENTIVE_DISSIMILARITY_2, |
951 #Returns [(sample name,average,rank)] | 1013 fRunExtreme=c_RUN_MAX_DISSIMILARITY_3, strAlphaMetadata=strAlphaMetadata, |
952 #SUMMED AND NORMALIZED | 1014 istrmTree=istrmTree, istrmEnvr=istrmEnvr, fInvertDiversity=fInvertDiversity) |
953 stratAbundanceTable.funcSumClades() | 1015 |
954 #Normalize data at this point | 1016 # Generate selection by the rank average of user defined taxa |
955 stratAbundanceTable.funcNormalize() | 1017 # Expects (Taxa (row) by Samples (column)) |
956 if c_RUN_RANK_AVERAGE_USER_4: | 1018 # Expects a column 0 of taxa id that is skipped |
957 selectedSamples[ConstantsMicropita.c_strFeature] = self.funcSelectTargetedTaxaSamples(abndMatrix=stratAbundanceTable, | 1019 # Returns [(sample name,average,rank)] |
958 lsTargetedTaxa=userDefinedTaxa, iSampleSelectionCount=iCount, sMethod=strFeatureSelection) | 1020 # SUMMED AND NORMALIZED |
959 logging.info("MicroPITA.funcRun:: Selected Samples Rank") | 1021 stratAbundanceTable.funcSumClades() |
960 logging.info(selectedSamples) | 1022 # Normalize data at this point |
961 | 1023 stratAbundanceTable.funcNormalize() |
962 ###SUMMED AND NORMALIZED analysis block | 1024 if c_RUN_RANK_AVERAGE_USER_4: |
963 #Diversity based metric will move reduce to terminal taxa as needed | 1025 selectedSamples[ConstantsMicropita.c_strFeature] = self.funcSelectTargetedTaxaSamples(abndMatrix=stratAbundanceTable, |
964 if c_RUN_MAX_DIVERSITY_1 or c_RUN_REPRESENTIVE_DISSIMILARITY_2 or c_RUN_MAX_DISSIMILARITY_3: | 1026 lsTargetedTaxa=userDefinedTaxa, iSampleSelectionCount=iCount, sMethod=strFeatureSelection) |
965 | 1027 logging.info("MicroPITA.funcRun:: Selected Samples Rank") |
966 self._funcRunNormalizeSensitiveMethods(abndData=stratAbundanceTable, iSampleSelectionCount=iCount, | 1028 logging.info(selectedSamples) |
967 dictSelectedSamples=selectedSamples, lsAlphaMetrics=diversityMetricsAlpha, | 1029 |
968 lsBetaMetrics=diversityMetricsBeta, | 1030 # SUMMED AND NORMALIZED analysis block |
969 lsInverseBetaMetrics=diversityMetricsBeta, | 1031 # Diversity based metric will move reduce to terminal taxa as needed |
970 fRunDiversity=c_RUN_MAX_DIVERSITY_1,fRunRepresentative=c_RUN_REPRESENTIVE_DISSIMILARITY_2, | 1032 if c_RUN_MAX_DIVERSITY_1 or c_RUN_REPRESENTIVE_DISSIMILARITY_2 or c_RUN_MAX_DISSIMILARITY_3: |
971 fRunExtreme=c_RUN_MAX_DISSIMILARITY_3, | 1033 |
972 istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr, fInvertDiversity=fInvertDiversity) | 1034 self._funcRunNormalizeSensitiveMethods(abndData=stratAbundanceTable, iSampleSelectionCount=iCount, |
973 | 1035 dictSelectedSamples=selectedSamples, lsAlphaMetrics=diversityMetricsAlpha, |
974 #5::Select randomly | 1036 lsBetaMetrics=diversityMetricsBeta, |
975 #Expects sampleNames = List of sample names [name, name, name...] | 1037 lsInverseBetaMetrics=diversityMetricsBeta, |
976 if(c_RUN_RANDOM_5): | 1038 fRunDiversity=c_RUN_MAX_DIVERSITY_1, fRunRepresentative=c_RUN_REPRESENTIVE_DISSIMILARITY_2, |
977 #Select randomly from sample names | 1039 fRunExtreme=c_RUN_MAX_DISSIMILARITY_3, |
978 selectedSamples[ConstantsMicropita.c_strRandom] = self.funcGetRandomSamples(lsSamples=stratAbundanceTable.funcGetSampleNames(), iNumberOfSamplesToReturn=iCount) | 1040 istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr, fInvertDiversity=fInvertDiversity) |
979 logging.info("MicroPITA.funcRun:: Selected Samples Random") | 1041 |
980 logging.info(selectedSamples) | 1042 # 5::Select randomly |
981 | 1043 # Expects sampleNames = List of sample names [name, name, name...] |
982 #Perform supervised selection | 1044 if(c_RUN_RANDOM_5): |
983 if c_RUN_DISTINCT or c_RUN_DISCRIMINANT: | 1045 # Select randomly from sample names |
984 if strLabel: | 1046 selectedSamples[ConstantsMicropita.c_strRandom] = self.funcGetRandomSamples( |
985 dictSelectionRet = self.funcRunSupervisedDistancesFromCentroids(abundanceTable=stratAbundanceTable, | 1047 lsSamples=stratAbundanceTable.funcGetSampleNames(), iNumberOfSamplesToReturn=iCount) |
986 fRunDistinct=c_RUN_DISTINCT, fRunDiscriminant=c_RUN_DISCRIMINANT, | 1048 logging.info("MicroPITA.funcRun:: Selected Samples Random") |
987 xOutputSupFile=ostmInputPredictFile,xPredictSupFile=ostmPredictFile, | 1049 logging.info(selectedSamples) |
988 strSupervisedMetadata=strLabel, iSampleSupSelectionCount=iCount, | 1050 |
989 lsOriginalSampleNames = totalAbundanceTable.funcGetSampleNames(), | 1051 # Perform supervised selection |
990 lsOriginalLabels = lsOriginalLabels, | 1052 if c_RUN_DISTINCT or c_RUN_DISCRIMINANT: |
991 fAppendFiles=fAppendSupFiles) | 1053 if strLabel: |
992 | 1054 dictSelectionRet = self.funcRunSupervisedDistancesFromCentroids(abundanceTable=stratAbundanceTable, |
993 [selectedSamples.setdefault(sKey,[]).extend(lValue) for sKey,lValue in dictSelectionRet.items()] | 1055 fRunDistinct=c_RUN_DISTINCT, fRunDiscriminant=c_RUN_DISCRIMINANT, |
994 | 1056 xOutputSupFile=ostmInputPredictFile, xPredictSupFile=ostmPredictFile, |
995 if not fAppendSupFiles: | 1057 strSupervisedMetadata=strLabel, iSampleSupSelectionCount=iCount, |
996 fAppendSupFiles = True | 1058 lsOriginalSampleNames=totalAbundanceTable.funcGetSampleNames(), |
997 logging.info("MicroPITA.funcRun:: Selected Samples Unsupervised") | 1059 lsOriginalLabels=lsOriginalLabels, |
998 logging.info(selectedSamples) | 1060 fAppendFiles=fAppendSupFiles) |
999 return selectedSamples | 1061 |
1000 | 1062 [selectedSamples.setdefault(sKey, []).extend( |
1001 #Testing: Happy path tested | 1063 lValue) for sKey, lValue in dictSelectionRet.items()] |
1002 @staticmethod | 1064 |
1003 def funcWriteSelectionToFile(dictSelection,xOutputFilePath): | 1065 if not fAppendSupFiles: |
1004 """ | 1066 fAppendSupFiles = True |
1005 Writes the selection of samples by method to an output file. | 1067 logging.info( |
1006 | 1068 "MicroPITA.funcRun:: Selected Samples Unsupervised") |
1007 :param dictSelection: The dictionary of selections by method to be written to a file. | 1069 logging.info(selectedSamples) |
1008 :type: Dictionary The dictionary of selections by method {"method":["sample selected","sample selected"...]} | 1070 return selectedSamples |
1009 :param xOutputFilePath: FileStream or String path to file inwhich the dictionary is written. | 1071 |
1010 :type: String FileStream or String path to file | 1072 # Testing: Happy path tested |
1011 """ | 1073 @staticmethod |
1012 | 1074 def funcWriteSelectionToFile(dictSelection, xOutputFilePath): |
1013 if not dictSelection: | 1075 """ |
1014 return | 1076 Writes the selection of samples by method to an output file. |
1015 | 1077 |
1016 #Open file | 1078 :param dictSelection: The dictionary of selections by method to be written to a file. |
1017 f = csv.writer(open(xOutputFilePath,"w") if isinstance(xOutputFilePath, str) else xOutputFilePath, delimiter=ConstantsMicropita.c_outputFileDelim ) | 1079 :type: Dictionary The dictionary of selections by method {"method":["sample selected","sample selected"...]} |
1018 | 1080 :param xOutputFilePath: FileStream or String path to file inwhich the dictionary is written. |
1019 #Create output content from dictionary | 1081 :type: String FileStream or String path to file |
1020 for sKey in dictSelection: | 1082 """ |
1021 f.writerow([sKey]+dictSelection[sKey]) | 1083 |
1022 logging.debug("MicroPITA.funcRun:: Selected samples output to file:"+str(dictSelection[sKey])) | 1084 if not dictSelection: |
1023 | 1085 return |
1024 #Testing: Happy Path tested | 1086 |
1025 @staticmethod | 1087 # Open file |
1026 def funcReadSelectionFileToDictionary(xInputFile): | 1088 f = csv.writer(open(xOutputFilePath, "w") if isinstance( |
1027 """ | 1089 xOutputFilePath, str) else xOutputFilePath, delimiter=ConstantsMicropita.c_outputFileDelim) |
1028 Reads in an output selection file from micropita and formats it into a dictionary. | 1090 |
1029 | 1091 # Create output content from dictionary |
1030 :param xInputFile: String path to file or file stream to read and translate into a dictionary. | 1092 for sKey in dictSelection: |
1031 {"method":["sample selected","sample selected"...]} | 1093 f.writerow([sKey]+dictSelection[sKey]) |
1032 :type: FileStream or String Path to file | 1094 logging.debug( |
1033 :return Dictionary: Samples selected by methods. | 1095 "MicroPITA.funcRun:: Selected samples output to file:"+str(dictSelection[sKey])) |
1034 Dictionary {"Selection Method":["SampleID","SampleID","SampleID",...]} | 1096 |
1035 """ | 1097 # Testing: Happy Path tested |
1036 | 1098 @staticmethod |
1037 #Open file | 1099 def funcReadSelectionFileToDictionary(xInputFile): |
1038 istmReader = csv.reader(open(xInputFile,'r') if isinstance(xInputFile, str) else xInputFile, delimiter = ConstantsMicropita.c_outputFileDelim) | 1100 """ |
1039 | 1101 Reads in an output selection file from micropita and formats it into a dictionary. |
1040 #Dictionary to hold selection data | 1102 |
1041 return dict([(lsLine[0], lsLine[1:]) for lsLine in istmReader]) | 1103 :param xInputFile: String path to file or file stream to read and translate into a dictionary. |
1042 | 1104 {"method":["sample selected","sample selected"...]} |
1043 #Set up arguments reader | 1105 :type: FileStream or String Path to file |
1044 argp = argparse.ArgumentParser( prog = "MicroPITA.py", | 1106 :return Dictionary: Samples selected by methods. |
1045 description = """Selects samples from abundance tables based on various selection schemes.""" ) | 1107 Dictionary {"Selection Method":["SampleID","SampleID","SampleID",...]} |
1046 | 1108 """ |
1047 args = argp.add_argument_group( "Common", "Commonly modified options" ) | 1109 |
1048 args.add_argument(ConstantsMicropita.c_strCountArgument,"--num", dest="iCount", metavar = "samples", default = 10, type = int, help = ConstantsMicropita.c_strCountHelp) | 1110 # Open file |
1049 args.add_argument("-m","--method", dest = "lstrMethods", metavar = "method", default = [], help = ConstantsMicropita.c_strSelectionTechniquesHelp, | 1111 istmReader = csv.reader(open(xInputFile, 'r') if isinstance( |
1050 choices = ConstantsMicropita.c_lsAllMethods, action = "append") | 1112 xInputFile, str) else xInputFile, delimiter=ConstantsMicropita.c_outputFileDelim) |
1051 | 1113 |
1052 args = argp.add_argument_group( "Custom", "Selecting and inputing custom metrics" ) | 1114 # Dictionary to hold selection data |
1053 args.add_argument("-a","--alpha", dest = "strAlphaDiversity", metavar = "AlphaDiversity", default = None, help = ConstantsMicropita.c_strCustomAlphaDiversityHelp, choices = Metric.setAlphaDiversities) | 1115 return dict([(lsLine[0], lsLine[1:]) for lsLine in istmReader]) |
1054 args.add_argument("-b","--beta", dest = "strBetaDiversity", metavar = "BetaDiversity", default = None, help = ConstantsMicropita.c_strCustomBetaDiversityHelp, choices = list(Metric.setBetaDiversities)+[Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted]) | 1116 |
1055 args.add_argument("-q","--alphameta", dest = "strAlphaMetadata", metavar = "AlphaDiversityMetadata", default = None, help = ConstantsMicropita.c_strCustomAlphaDiversityMetadataHelp) | 1117 |
1056 args.add_argument("-x","--betamatrix", dest = "istmBetaMatrix", metavar = "BetaDiversityMatrix", default = None, help = ConstantsMicropita.c_strCustomBetaDiversityMatrixHelp) | 1118 # Set up arguments reader |
1057 args.add_argument("-o","--tree", dest = "istrmTree", metavar = "PhylogeneticTree", default = None, help = ConstantsMicropita.c_strCustomPhylogeneticTreeHelp) | 1119 argp = argparse.ArgumentParser(prog="MicroPITA.py", |
1058 args.add_argument("-i","--envr", dest = "istrmEnvr", metavar = "EnvironmentFile", default = None, help = ConstantsMicropita.c_strCustomEnvironmentFileHelp) | 1120 description="""Selects samples from abundance tables based on various selection schemes.""") |
1059 args.add_argument("-f","--invertDiversity", dest = "fInvertDiversity", action="store_true", default = False, help = ConstantsMicropita.c_strInvertDiversityHelp) | 1121 |
1060 | 1122 args = argp.add_argument_group("Common", "Commonly modified options") |
1061 args = argp.add_argument_group( "Miscellaneous", "Row/column identifiers and feature targeting options" ) | 1123 args.add_argument(ConstantsMicropita.c_strCountArgument, "--num", dest="iCount", |
1062 args.add_argument("-d",ConstantsMicropita.c_strIDNameArgument, dest="strIDName", metavar="sample_id", help= ConstantsMicropita.c_strIDNameHelp) | 1124 metavar="samples", default=10, type=int, help=ConstantsMicropita.c_strCountHelp) |
1063 args.add_argument("-l",ConstantsMicropita.c_strLastMetadataNameArgument, dest="strLastMetadataName", metavar = "metadata_id", default = None, | 1125 args.add_argument("-m", "--method", dest="lstrMethods", metavar="method", default=[], help=ConstantsMicropita.c_strSelectionTechniquesHelp, |
1064 help= ConstantsMicropita.c_strLastMetadataNameHelp) | 1126 choices=ConstantsMicropita.c_lsAllMethods, action="append") |
1065 args.add_argument("-r",ConstantsMicropita.c_strTargetedFeatureMethodArgument, dest="strFeatureSelection", metavar="targeting_method", default=ConstantsMicropita.lsTargetedFeatureMethodValues[0], | 1127 |
1066 choices=ConstantsMicropita.lsTargetedFeatureMethodValues, help= ConstantsMicropita.c_strTargetedFeatureMethodHelp) | 1128 args = argp.add_argument_group( |
1067 args.add_argument("-t",ConstantsMicropita.c_strTargetedSelectionFileArgument, dest="istmFeatures", metavar="feature_file", type=argparse.FileType("rU"), help=ConstantsMicropita.c_strTargetedSelectionFileHelp) | 1129 "Custom", "Selecting and inputing custom metrics") |
1068 args.add_argument("-w",ConstantsMicropita.c_strFeatureMetadataArgument, dest="strLastFeatureMetadata", metavar="Last_Feature_Metadata", default=None, help=ConstantsMicropita.c_strFeatureMetadataHelp) | 1130 args.add_argument("-a", "--alpha", dest="strAlphaDiversity", metavar="AlphaDiversity", default=None, |
1069 | 1131 help=ConstantsMicropita.c_strCustomAlphaDiversityHelp, choices=Metric.setAlphaDiversities) |
1070 args = argp.add_argument_group( "Data labeling", "Metadata IDs for strata and supervised label values" ) | 1132 args.add_argument("-b", "--beta", dest="strBetaDiversity", metavar="BetaDiversity", default=None, help=ConstantsMicropita.c_strCustomBetaDiversityHelp, |
1071 args.add_argument("-e",ConstantsMicropita.c_strSupervisedLabelArgument, dest="strLabel", metavar= "supervised_id", help=ConstantsMicropita.c_strSupervisedLabelHelp) | 1133 choices=list(Metric.setBetaDiversities)+[Metric.c_strUnifracUnweighted, Metric.c_strUnifracWeighted]) |
1072 args.add_argument("-s",ConstantsMicropita.c_strUnsupervisedStratifyMetadataArgument, dest="strUnsupervisedStratify", metavar="stratify_id", | 1134 args.add_argument("-q", "--alphameta", dest="strAlphaMetadata", metavar="AlphaDiversityMetadata", |
1073 help= ConstantsMicropita.c_strUnsupervisedStratifyMetadataHelp) | 1135 default=None, help=ConstantsMicropita.c_strCustomAlphaDiversityMetadataHelp) |
1074 | 1136 args.add_argument("-x", "--betamatrix", dest="istmBetaMatrix", metavar="BetaDiversityMatrix", |
1075 args = argp.add_argument_group( "File formatting", "Rarely modified file formatting options" ) | 1137 default=None, help=ConstantsMicropita.c_strCustomBetaDiversityMatrixHelp) |
1076 args.add_argument("-j",ConstantsMicropita.c_strFileDelimiterArgument, dest="cFileDelimiter", metavar="column_delimiter", default="\t", help=ConstantsMicropita.c_strFileDelimiterHelp) | 1138 args.add_argument("-o", "--tree", dest="istrmTree", metavar="PhylogeneticTree", |
1077 args.add_argument("-k",ConstantsMicropita.c_strFeatureNameDelimiterArgument, dest="cFeatureNameDelimiter", metavar="taxonomy_delimiter", default="|", help=ConstantsMicropita.c_strFeatureNameDelimiterHelp) | 1139 default=None, help=ConstantsMicropita.c_strCustomPhylogeneticTreeHelp) |
1078 | 1140 args.add_argument("-i", "--envr", dest="istrmEnvr", metavar="EnvironmentFile", |
1079 args = argp.add_argument_group( "Debugging", "Debugging options - modify at your own risk!" ) | 1141 default=None, help=ConstantsMicropita.c_strCustomEnvironmentFileHelp) |
1080 args.add_argument("-v",ConstantsMicropita.c_strLoggingArgument, dest="strLogLevel", metavar = "log_level", default="WARNING", | 1142 args.add_argument("-f", "--invertDiversity", dest="fInvertDiversity", action="store_true", |
1081 choices=ConstantsMicropita.c_lsLoggingChoices, help= ConstantsMicropita.c_strLoggingHelp) | 1143 default=False, help=ConstantsMicropita.c_strInvertDiversityHelp) |
1082 args.add_argument("-c",ConstantsMicropita.c_strCheckedAbundanceFileArgument, dest="ostmCheckedFile", metavar = "output_qc", type = argparse.FileType("w"), help = ConstantsMicropita.c_strCheckedAbundanceFileHelp) | 1144 |
1083 args.add_argument("-g",ConstantsMicropita.c_strLoggingFileArgument, dest="ostmLoggingFile", metavar = "output_log", type = argparse.FileType("w"), help = ConstantsMicropita.c_strLoggingFileHelp) | 1145 args = argp.add_argument_group( |
1084 args.add_argument("-u",ConstantsMicropita.c_strSupervisedInputFile, dest="ostmInputPredictFile", metavar = "output_scaled", type = argparse.FileType("w"), help = ConstantsMicropita.c_strSupervisedInputFileHelp) | 1146 "Miscellaneous", "Row/column identifiers and feature targeting options") |
1085 args.add_argument("-p",ConstantsMicropita.c_strSupervisedPredictedFile, dest="ostmPredictFile", metavar = "output_labels", type = argparse.FileType("w"), help = ConstantsMicropita.c_strSupervisedPredictedFileHelp) | 1147 args.add_argument("-d", ConstantsMicropita.c_strIDNameArgument, dest="strIDName", |
1086 | 1148 metavar="sample_id", help=ConstantsMicropita.c_strIDNameHelp) |
1087 argp.add_argument("istmInput", metavar = "input.pcl/biome", type = argparse.FileType("rU"), help = ConstantsMicropita.c_strAbundanceFileHelp, | 1149 args.add_argument("-l", ConstantsMicropita.c_strLastMetadataNameArgument, dest="strLastMetadataName", metavar="metadata_id", default=None, |
1088 default = sys.stdin) | 1150 help=ConstantsMicropita.c_strLastMetadataNameHelp) |
1089 argp.add_argument("ostmOutput", metavar = "output.txt", type = argparse.FileType("w"), help = ConstantsMicropita.c_strGenericOutputDataFileHelp, | 1151 args.add_argument("-r", ConstantsMicropita.c_strTargetedFeatureMethodArgument, dest="strFeatureSelection", metavar="targeting_method", default=ConstantsMicropita.lsTargetedFeatureMethodValues[0], |
1090 default = sys.stdout) | 1152 choices=ConstantsMicropita.lsTargetedFeatureMethodValues, help=ConstantsMicropita.c_strTargetedFeatureMethodHelp) |
1091 | 1153 args.add_argument("-t", ConstantsMicropita.c_strTargetedSelectionFileArgument, dest="istmFeatures", |
1092 __doc__ = "::\n\n\t" + argp.format_help( ).replace( "\n", "\n\t" ) + __doc__ | 1154 metavar="feature_file", type=argparse.FileType("rU"), help=ConstantsMicropita.c_strTargetedSelectionFileHelp) |
1093 | 1155 args.add_argument("-w", ConstantsMicropita.c_strFeatureMetadataArgument, dest="strLastFeatureMetadata", |
1094 def _main( ): | 1156 metavar="Last_Feature_Metadata", default=None, help=ConstantsMicropita.c_strFeatureMetadataHelp) |
1095 args = argp.parse_args( ) | 1157 |
1096 | 1158 args = argp.add_argument_group( |
1097 #Set up logger | 1159 "Data labeling", "Metadata IDs for strata and supervised label values") |
1098 iLogLevel = getattr(logging, args.strLogLevel.upper(), None) | 1160 args.add_argument("-e", ConstantsMicropita.c_strSupervisedLabelArgument, dest="strLabel", |
1099 logging.basicConfig(stream = args.ostmLoggingFile if args.ostmLoggingFile else sys.stderr, filemode = 'w', level=iLogLevel) | 1161 metavar="supervised_id", help=ConstantsMicropita.c_strSupervisedLabelHelp) |
1100 | 1162 args.add_argument("-s", ConstantsMicropita.c_strUnsupervisedStratifyMetadataArgument, dest="strUnsupervisedStratify", metavar="stratify_id", |
1101 #Run micropita | 1163 help=ConstantsMicropita.c_strUnsupervisedStratifyMetadataHelp) |
1102 logging.info("MicroPITA:: Start microPITA") | 1164 |
1103 microPITA = MicroPITA() | 1165 args = argp.add_argument_group( |
1104 | 1166 "File formatting", "Rarely modified file formatting options") |
1105 #Argparse will append to the default but will not remove the default so I do this here | 1167 args.add_argument("-j", ConstantsMicropita.c_strFileDelimiterArgument, dest="cFileDelimiter", |
1106 if not len(args.lstrMethods): | 1168 metavar="column_delimiter", default="\t", help=ConstantsMicropita.c_strFileDelimiterHelp) |
1107 args.lstrMethods = [ConstantsMicropita.c_strRepresentative] | 1169 args.add_argument("-k", ConstantsMicropita.c_strFeatureNameDelimiterArgument, dest="cFeatureNameDelimiter", |
1108 | 1170 metavar="taxonomy_delimiter", default="|", help=ConstantsMicropita.c_strFeatureNameDelimiterHelp) |
1109 dictSelectedSamples = microPITA.funcRun( | 1171 |
1110 strIDName = args.strIDName, | 1172 args = argp.add_argument_group( |
1111 strLastMetadataName = args.strLastMetadataName, | 1173 "Debugging", "Debugging options - modify at your own risk!") |
1112 istmInput = args.istmInput, | 1174 args.add_argument("-v", ConstantsMicropita.c_strLoggingArgument, dest="strLogLevel", metavar="log_level", default="WARNING", |
1113 ostmInputPredictFile = args.ostmInputPredictFile, | 1175 choices=ConstantsMicropita.c_lsLoggingChoices, help=ConstantsMicropita.c_strLoggingHelp) |
1114 ostmPredictFile = args.ostmPredictFile, | 1176 args.add_argument("-c", ConstantsMicropita.c_strCheckedAbundanceFileArgument, dest="ostmCheckedFile", |
1115 ostmCheckedFile = args.ostmCheckedFile, | 1177 metavar="output_qc", type=argparse.FileType("w"), help=ConstantsMicropita.c_strCheckedAbundanceFileHelp) |
1116 ostmOutput = args.ostmOutput, | 1178 args.add_argument("-g", ConstantsMicropita.c_strLoggingFileArgument, dest="ostmLoggingFile", |
1117 cDelimiter = args.cFileDelimiter, | 1179 metavar="output_log", type=argparse.FileType("w"), help=ConstantsMicropita.c_strLoggingFileHelp) |
1118 cFeatureNameDelimiter = args.cFeatureNameDelimiter, | 1180 args.add_argument("-u", ConstantsMicropita.c_strSupervisedInputFile, dest="ostmInputPredictFile", |
1119 istmFeatures = args.istmFeatures, | 1181 metavar="output_scaled", type=argparse.FileType("w"), help=ConstantsMicropita.c_strSupervisedInputFileHelp) |
1120 strFeatureSelection = args.strFeatureSelection, | 1182 args.add_argument("-p", ConstantsMicropita.c_strSupervisedPredictedFile, dest="ostmPredictFile", |
1121 iCount = args.iCount, | 1183 metavar="output_labels", type=argparse.FileType("w"), help=ConstantsMicropita.c_strSupervisedPredictedFileHelp) |
1122 strLastRowMetadata = args.strLastFeatureMetadata, | 1184 |
1123 strLabel = args.strLabel, | 1185 argp.add_argument("istmInput", metavar="input.pcl/biome", type=argparse.FileType("rU"), help=ConstantsMicropita.c_strAbundanceFileHelp, |
1124 strStratify = args.strUnsupervisedStratify, | 1186 default=sys.stdin) |
1125 strCustomAlpha = args.strAlphaDiversity, | 1187 argp.add_argument("ostmOutput", metavar="output.txt", type=argparse.FileType("w"), help=ConstantsMicropita.c_strGenericOutputDataFileHelp, |
1126 strCustomBeta = args.strBetaDiversity, | 1188 default=sys.stdout) |
1127 strAlphaMetadata = args.strAlphaMetadata, | 1189 |
1128 istmBetaMatrix = args.istmBetaMatrix, | 1190 __doc__ = "::\n\n\t" + argp.format_help().replace("\n", "\n\t") + __doc__ |
1129 istrmTree = args.istrmTree, | 1191 |
1130 istrmEnvr = args.istrmEnvr, | 1192 |
1131 lstrMethods = args.lstrMethods, | 1193 def _main(): |
1132 fInvertDiversity = args.fInvertDiversity | 1194 args = argp.parse_args() |
1133 ) | 1195 |
1134 | 1196 # Set up logger |
1135 if not dictSelectedSamples: | 1197 iLogLevel = getattr(logging, args.strLogLevel.upper(), None) |
1136 logging.error("MicroPITA:: Error, did not get a result from analysis.") | 1198 logging.basicConfig( |
1137 return -1 | 1199 stream=args.ostmLoggingFile if args.ostmLoggingFile else sys.stderr, filemode='w', level=iLogLevel) |
1138 logging.info("End microPITA") | 1200 |
1139 | 1201 # Run micropita |
1140 #Log output for debugging | 1202 logging.info("MicroPITA:: Start microPITA") |
1141 logging.debug("MicroPITA:: Returned the following samples:"+str(dictSelectedSamples)) | 1203 microPITA = MicroPITA() |
1142 | 1204 |
1143 #Write selection to file | 1205 # Argparse will append to the default but will not remove the default so I do this here |
1144 microPITA.funcWriteSelectionToFile(dictSelection=dictSelectedSamples, xOutputFilePath=args.ostmOutput) | 1206 if not len(args.lstrMethods): |
1207 args.lstrMethods = [ConstantsMicropita.c_strRepresentative] | |
1208 | |
1209 dictSelectedSamples = microPITA.funcRun( | |
1210 strIDName=args.strIDName, | |
1211 strLastMetadataName=args.strLastMetadataName, | |
1212 istmInput=args.istmInput, | |
1213 ostmInputPredictFile=args.ostmInputPredictFile, | |
1214 ostmPredictFile=args.ostmPredictFile, | |
1215 ostmCheckedFile=args.ostmCheckedFile, | |
1216 ostmOutput=args.ostmOutput, | |
1217 cDelimiter=args.cFileDelimiter, | |
1218 cFeatureNameDelimiter=args.cFeatureNameDelimiter, | |
1219 istmFeatures=args.istmFeatures, | |
1220 strFeatureSelection=args.strFeatureSelection, | |
1221 iCount=args.iCount, | |
1222 strLastRowMetadata=args.strLastFeatureMetadata, | |
1223 strLabel=args.strLabel, | |
1224 strStratify=args.strUnsupervisedStratify, | |
1225 strCustomAlpha=args.strAlphaDiversity, | |
1226 strCustomBeta=args.strBetaDiversity, | |
1227 strAlphaMetadata=args.strAlphaMetadata, | |
1228 istmBetaMatrix=args.istmBetaMatrix, | |
1229 istrmTree=args.istrmTree, | |
1230 istrmEnvr=args.istrmEnvr, | |
1231 lstrMethods=args.lstrMethods, | |
1232 fInvertDiversity=args.fInvertDiversity | |
1233 ) | |
1234 | |
1235 if not dictSelectedSamples: | |
1236 logging.error("MicroPITA:: Error, did not get a result from analysis.") | |
1237 return -1 | |
1238 logging.info("End microPITA") | |
1239 | |
1240 # Log output for debugging | |
1241 logging.debug("MicroPITA:: Returned the following samples:" + | |
1242 str(dictSelectedSamples)) | |
1243 | |
1244 # Write selection to file | |
1245 microPITA.funcWriteSelectionToFile( | |
1246 dictSelection=dictSelectedSamples, xOutputFilePath=args.ostmOutput) | |
1247 | |
1145 | 1248 |
1146 if __name__ == "__main__": | 1249 if __name__ == "__main__": |
1147 _main( ) | 1250 _main() |