comparison format_input.py @ 20:a6284ef17bf3 draft default tip

Fixed bug due to numerical approximation after normalization affecting root-level clades (e.g. "Bacteria" or "Archaea")
author george-weingart
date Tue, 07 Jul 2015 13:26:55 -0400
parents 47ac77f2fe68
children
comparison
equal deleted inserted replaced
19:7262089c7b47 20:a6284ef17bf3
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 2
3 import sys,os,argparse,pickle,re 3 import sys,os,argparse,pickle,re,numpy
4 4
5 def read_input_file(inp_file): 5
6
7
8 #***************************************************************************************************************
9 #* Log of change *
10 #* January 16, 2014 - George Weingart - george.weingart@gmail.com *
11 #* *
12 #* biom Support *
13 #* Modified the program to enable it to accept biom files as input *
14 #* *
15 #* Added two optional input parameters: *
16 #* 1. biom_c is the name of the biom metadata to be used as class *
17 #* 2. biom_s is the name of the biom metadata to be used as subclass *
18 #* class and subclass are used in the same context as the original *
19 #* parameters class and subclass *
20 #* These parameters are totally optional, the default is the program *
21 #* chooses as class the first metadata received from the conversion *
22 #* of the biom file into a sequential (pcl) file as generated by *
23 #* breadcrumbs, and similarly, the second metadata is selected as *
24 #* subclass. *
25 #* The syntax or logic for the original non-biom case was NOT changed. *
26 #* *
27 #* <******************* IMPORTANT NOTE *************************> *
28 #* The biom case requires breadcrumbs and therefore there is a *
29 #* a conditional import of the breadcrumbs modules *
30 #* If the User uses a biom input and breadcrumbs is not detected, *
31 #* the run is abnormally ended *
32 #* breadcrumbs itself needs a biom environment, so if the immport *
33 #* of biom in breadcrumbs fails, the run is also abnormally
34 #* ended (Only if the input file was biom) *
35 #* *
36 #* USAGE EXAMPLES *
37 #* -------------- *
38 #* Case #1: Using a sequential file as input (Old version - did not change *
39 #* ./format_input.py hmp_aerobiosis_small.txt hmp_aerobiosis_small.in -c 1 -s 2 -u 3 -o 1000000 *
40 #* Case #2: Using a biom file as input *
41 #* ./format_input.py hmp_aerobiosis_small.biom hmp_aerobiosis_small.in -o 1000000 *
42 #* Case #3: Using a biom file as input and override the class and subclass *
43 #* ./format_input.py lefse.biom hmp_aerobiosis_small.in -biom_c oxygen_availability -biom_s body_site -o 1000000
44 #* *
45 #***************************************************************************************************************
46
47 def read_input_file(inp_file, CommonArea):
48
49 if inp_file.endswith('.biom'): #* If the file format is biom:
50 CommonArea = biom_processing(inp_file) #* Process in biom format
51 return CommonArea #* And return the CommonArea
52
6 with open(inp_file) as inp: 53 with open(inp_file) as inp:
7 return [[v.strip() for v in line.strip().split("\t")] for line in inp.readlines()] 54 CommonArea['ReturnedData'] = [[v.strip() for v in line.strip().split("\t")] for line in inp.readlines()]
55 return CommonArea
8 56
9 def transpose(data): 57 def transpose(data):
10 return zip(*data) 58 return zip(*data)
11 59
12 def read_params(args): 60 def read_params(args):
29 parser.add_argument('-m',dest="missing_p", choices=["f","s"], type=str, default="d", 77 parser.add_argument('-m',dest="missing_p", choices=["f","s"], type=str, default="d",
30 help="set the policy to adopt with missin values: f removes the features with missing values, s removes samples with missing values (default f)") 78 help="set the policy to adopt with missin values: f removes the features with missing values, s removes samples with missing values (default f)")
31 parser.add_argument('-n',dest="subcl_min_card", metavar="int", type=int, default=10, 79 parser.add_argument('-n',dest="subcl_min_card", metavar="int", type=int, default=10,
32 help="set the minimum cardinality of each subclass (subclasses with low cardinalities will be grouped together, if the cardinality is still low, no pairwise comparison will be performed with them)") 80 help="set the minimum cardinality of each subclass (subclasses with low cardinalities will be grouped together, if the cardinality is still low, no pairwise comparison will be performed with them)")
33 81
82 parser.add_argument('-biom_c',dest="biom_class", type=str,
83 help="For biom input files: Set which feature use as class ")
84 parser.add_argument('-biom_s',dest="biom_subclass", type=str,
85 help="For biom input files: set which feature use as subclass ")
86
34 args = parser.parse_args() 87 args = parser.parse_args()
35 88
36 return vars(args) 89 return vars(args)
37 90
38 def remove_missing(data,roc): 91 def remove_missing(data,roc):
134 for i,m in enumerate(mul): 187 for i,m in enumerate(mul):
135 if m == 0: mul[i] = 0.0 188 if m == 0: mul[i] = 0.0
136 else: mul[i] = float(norm) / m 189 else: mul[i] = float(norm) / m
137 for k,v in feats.items(): 190 for k,v in feats.items():
138 feats[k] = [val*mul[i] for i,val in enumerate(v)] 191 feats[k] = [val*mul[i] for i,val in enumerate(v)]
192 if numpy.mean(feats[k]) and (numpy.std(feats[k])/numpy.mean(feats[k])) < 1e-10:
193 feats[k] = [ float(round(kv*1e6)/1e6) for kv in feats[k]]
139 return feats 194 return feats
140 195
141 def add_missing_levels2(ff): 196 def add_missing_levels2(ff):
142 197
143 if sum( [f.count(".") for f in ff] ) < 1: return ff 198 if sum( [f.count(".") for f in ff] ) < 1: return ff
217 new_subcl = [] 272 new_subcl = []
218 for i,sc in enumerate(subcl): 273 for i,sc in enumerate(subcl):
219 if sc in toc: new_subcl.append(cl[i]+"_"+sc) 274 if sc in toc: new_subcl.append(cl[i]+"_"+sc)
220 else: new_subcl.append(sc) 275 else: new_subcl.append(sc)
221 return new_subcl 276 return new_subcl
277
278
279 #*************************************************************************************
280 #* Modifications by George Weingart, Jan 15, 2014 *
281 #* If the input file is biom: *
282 #* a. Load an AbundanceTable (Using breadcrumbs) *
283 #* b. Create a sequential file from the AbundanceTable (de-facto - pcl) *
284 #* c. Use that file as input to the rest of the program *
285 #* d. Calculate the c,s,and u parameters, either from the values the User entered *
286 #* from the meta data values in the biom file or set up defaults *
287 #* <<<------------- I M P O R T A N T N O T E ------------------->> *
288 #* breadcrumbs src directory must be included in the PYTHONPATH *
289 #* <<<------------- I M P O R T A N T N O T E ------------------->> *
290 #*************************************************************************************
291 def biom_processing(inp_file):
292 CommonArea = dict() #* Set up a dictionary to return
293 CommonArea['abndData'] = AbundanceTable.funcMakeFromFile(inp_file, #* Create AbundanceTable from input biom file
294 cDelimiter = None,
295 sMetadataID = None,
296 sLastMetadataRow = None,
297 sLastMetadata = None,
298 strFormat = None)
299
300 #****************************************************************
301 #* Building the data element here *
302 #****************************************************************
303 ResolvedData = list() #This is the Resolved data that will be returned
304 IDMetadataName = CommonArea['abndData'].funcGetIDMetadataName() #* ID Metadataname
305 IDMetadata = [CommonArea['abndData'].funcGetIDMetadataName()] #* The first Row
306 for IDMetadataEntry in CommonArea['abndData'].funcGetMetadataCopy()[IDMetadataName]: #* Loop on all the metadata values
307 IDMetadata.append(IDMetadataEntry)
308 ResolvedData.append(IDMetadata) #Add the IDMetadata with all its values to the resolved area
309 for key, value in CommonArea['abndData'].funcGetMetadataCopy().iteritems():
310 if key != IDMetadataName:
311 MetadataEntry = list() #* Set it up
312 MetadataEntry.append(key) #* And post it to the area
313 for x in value:
314 MetadataEntry.append(x) #* Append the metadata value name
315 ResolvedData.append(MetadataEntry)
316 for AbundanceDataEntry in CommonArea['abndData'].funcGetAbundanceCopy(): #* The Abundance Data
317 lstAbundanceDataEntry = list(AbundanceDataEntry) #Convert tuple to list
318 ResolvedData.append(lstAbundanceDataEntry) #Append the list to the metadata list
319 CommonArea['ReturnedData'] = ResolvedData #Post the results
320 return CommonArea
321
322
323 #*******************************************************************************
324 #* Check the params and override in the case of biom *
325 #*******************************************************************************
326 def check_params_for_biom_case(params, CommonArea):
327 CommonArea['MetadataNames'] = list() #Metadata names
328 params['original_class'] = params['class'] #Save the original class
329 params['original_subclass'] = params['subclass'] #Save the original subclass
330 params['original_subject'] = params['subject'] #Save the original subclass
331
332
333 TotalMetadataEntriesAndIDInBiomFile = len(CommonArea['abndData'].funcGetMetadataCopy()) # The number of metadata entries
334 for i in range(0,TotalMetadataEntriesAndIDInBiomFile): #* Populate the meta data names table
335 CommonArea['MetadataNames'].append(CommonArea['ReturnedData'][i][0]) #Add the metadata name
336
337
338 #****************************************************
339 #* Setting the params here *
340 #****************************************************
341
342 if TotalMetadataEntriesAndIDInBiomFile > 0: #If there is at least one entry - has to be the subject
343 params['subject'] = 1
344 if TotalMetadataEntriesAndIDInBiomFile == 2: #If there are 2 - The first is the subject and the second has to be the metadata, and that is the class
345 params['class'] = 2
346 if TotalMetadataEntriesAndIDInBiomFile == 3: #If there are 3: Set up default that the second entry is the class and the third is the subclass
347 params['class'] = 2
348 params['subclass'] = 3
349 FlagError = False #Set up error flag
350
351 if not params['biom_class'] is None and not params['biom_subclass'] is None: #Check if the User passed a valid class and subclass
352 if params['biom_class'] in CommonArea['MetadataNames']:
353 params['class'] = CommonArea['MetadataNames'].index(params['biom_class']) +1 #* Set up the index for that metadata
354 else:
355 FlagError = True
356 if params['biom_subclass'] in CommonArea['MetadataNames']:
357 params['subclass'] = CommonArea['MetadataNames'].index(params['biom_subclass']) +1 #* Set up the index for that metadata
358 else:
359 FlagError = True
360 if FlagError == True: #* If the User passed an invalid class
361 print "**Invalid biom class or subclass passed - Using defaults: First metadata=class, Second Metadata=subclass\n"
362 params['class'] = 2
363 params['subclass'] = 3
364 return params
365
366
222 367
223 if __name__ == '__main__': 368 if __name__ == '__main__':
369 CommonArea = dict() #Build a Common Area to pass variables in the biom case
224 params = read_params(sys.argv) 370 params = read_params(sys.argv)
225 371
372 #*************************************************************
373 #* Conditionally import breadcrumbs if file is a biom file *
374 #* If it is and no breadcrumbs found - abnormally exit *
375 #*************************************************************
376 if params['input_file'].endswith('.biom'):
377 try:
378 from lefsebiom.ConstantsBreadCrumbs import *
379 from lefsebiom.AbundanceTable import *
380 except ImportError:
381 sys.stderr.write("************************************************************************************************************ \n")
382 sys.stderr.write("* Error: Breadcrumbs libraries not detected - required to process biom files - run abnormally terminated * \n")
383 sys.stderr.write("************************************************************************************************************ \n")
384 exit(1)
385
386
226 if type(params['subclass']) is int and int(params['subclass']) < 1: 387 if type(params['subclass']) is int and int(params['subclass']) < 1:
227 params['subclass'] = None 388 params['subclass'] = None
228 if type(params['subject']) is int and int(params['subject']) < 1: 389 if type(params['subject']) is int and int(params['subject']) < 1:
229 params['subject'] = None 390 params['subject'] = None
230 data = read_input_file(sys.argv[1]) 391
392
393 CommonArea = read_input_file(sys.argv[1], CommonArea) #Pass The CommonArea to the Read
394 data = CommonArea['ReturnedData'] #Select the data
395
396 if sys.argv[1].endswith('biom'): #* Check if biom:
397 params = check_params_for_biom_case(params, CommonArea) #Check the params for the biom case
231 398
232 if params['feats_dir'] == "c": 399 if params['feats_dir'] == "c":
233 data = transpose(data) 400 data = transpose(data)
234 401
235 ncl = 1 402 ncl = 1