Mercurial > repos > george-weingart > lefse
comparison format_input.py @ 20:a6284ef17bf3 draft default tip
Fixed bug due to numerical approximation after normalization affecting root-level clades (e.g. "Bacteria" or "Archaea")
| author | george-weingart |
|---|---|
| date | Tue, 07 Jul 2015 13:26:55 -0400 |
| parents | 47ac77f2fe68 |
| children |
comparison
equal
deleted
inserted
replaced
| 19:7262089c7b47 | 20:a6284ef17bf3 |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 | 2 |
| 3 import sys,os,argparse,pickle,re | 3 import sys,os,argparse,pickle,re,numpy |
| 4 | 4 |
| 5 def read_input_file(inp_file): | 5 |
| 6 | |
| 7 | |
| 8 #*************************************************************************************************************** | |
| 9 #* Log of change * | |
| 10 #* January 16, 2014 - George Weingart - george.weingart@gmail.com * | |
| 11 #* * | |
| 12 #* biom Support * | |
| 13 #* Modified the program to enable it to accept biom files as input * | |
| 14 #* * | |
| 15 #* Added two optional input parameters: * | |
| 16 #* 1. biom_c is the name of the biom metadata to be used as class * | |
| 17 #* 2. biom_s is the name of the biom metadata to be used as subclass * | |
| 18 #* class and subclass are used in the same context as the original * | |
| 19 #* parameters class and subclass * | |
| 20 #* These parameters are totally optional, the default is the program * | |
| 21 #* chooses as class the first metadata received from the conversion * | |
| 22 #* of the biom file into a sequential (pcl) file as generated by * | |
| 23 #* breadcrumbs, and similarly, the second metadata is selected as * | |
| 24 #* subclass. * | |
| 25 #* The syntax or logic for the original non-biom case was NOT changed. * | |
| 26 #* * | |
| 27 #* <******************* IMPORTANT NOTE *************************> * | |
| 28 #* The biom case requires breadcrumbs and therefore there is a * | |
| 29 #* a conditional import of the breadcrumbs modules * | |
| 30 #* If the User uses a biom input and breadcrumbs is not detected, * | |
| 31 #* the run is abnormally ended * | |
| 32 #* breadcrumbs itself needs a biom environment, so if the immport * | |
| 33 #* of biom in breadcrumbs fails, the run is also abnormally | |
| 34 #* ended (Only if the input file was biom) * | |
| 35 #* * | |
| 36 #* USAGE EXAMPLES * | |
| 37 #* -------------- * | |
| 38 #* Case #1: Using a sequential file as input (Old version - did not change * | |
| 39 #* ./format_input.py hmp_aerobiosis_small.txt hmp_aerobiosis_small.in -c 1 -s 2 -u 3 -o 1000000 * | |
| 40 #* Case #2: Using a biom file as input * | |
| 41 #* ./format_input.py hmp_aerobiosis_small.biom hmp_aerobiosis_small.in -o 1000000 * | |
| 42 #* Case #3: Using a biom file as input and override the class and subclass * | |
| 43 #* ./format_input.py lefse.biom hmp_aerobiosis_small.in -biom_c oxygen_availability -biom_s body_site -o 1000000 | |
| 44 #* * | |
| 45 #*************************************************************************************************************** | |
| 46 | |
| 47 def read_input_file(inp_file, CommonArea): | |
| 48 | |
| 49 if inp_file.endswith('.biom'): #* If the file format is biom: | |
| 50 CommonArea = biom_processing(inp_file) #* Process in biom format | |
| 51 return CommonArea #* And return the CommonArea | |
| 52 | |
| 6 with open(inp_file) as inp: | 53 with open(inp_file) as inp: |
| 7 return [[v.strip() for v in line.strip().split("\t")] for line in inp.readlines()] | 54 CommonArea['ReturnedData'] = [[v.strip() for v in line.strip().split("\t")] for line in inp.readlines()] |
| 55 return CommonArea | |
| 8 | 56 |
| 9 def transpose(data): | 57 def transpose(data): |
| 10 return zip(*data) | 58 return zip(*data) |
| 11 | 59 |
| 12 def read_params(args): | 60 def read_params(args): |
| 29 parser.add_argument('-m',dest="missing_p", choices=["f","s"], type=str, default="d", | 77 parser.add_argument('-m',dest="missing_p", choices=["f","s"], type=str, default="d", |
| 30 help="set the policy to adopt with missin values: f removes the features with missing values, s removes samples with missing values (default f)") | 78 help="set the policy to adopt with missin values: f removes the features with missing values, s removes samples with missing values (default f)") |
| 31 parser.add_argument('-n',dest="subcl_min_card", metavar="int", type=int, default=10, | 79 parser.add_argument('-n',dest="subcl_min_card", metavar="int", type=int, default=10, |
| 32 help="set the minimum cardinality of each subclass (subclasses with low cardinalities will be grouped together, if the cardinality is still low, no pairwise comparison will be performed with them)") | 80 help="set the minimum cardinality of each subclass (subclasses with low cardinalities will be grouped together, if the cardinality is still low, no pairwise comparison will be performed with them)") |
| 33 | 81 |
| 82 parser.add_argument('-biom_c',dest="biom_class", type=str, | |
| 83 help="For biom input files: Set which feature use as class ") | |
| 84 parser.add_argument('-biom_s',dest="biom_subclass", type=str, | |
| 85 help="For biom input files: set which feature use as subclass ") | |
| 86 | |
| 34 args = parser.parse_args() | 87 args = parser.parse_args() |
| 35 | 88 |
| 36 return vars(args) | 89 return vars(args) |
| 37 | 90 |
| 38 def remove_missing(data,roc): | 91 def remove_missing(data,roc): |
| 134 for i,m in enumerate(mul): | 187 for i,m in enumerate(mul): |
| 135 if m == 0: mul[i] = 0.0 | 188 if m == 0: mul[i] = 0.0 |
| 136 else: mul[i] = float(norm) / m | 189 else: mul[i] = float(norm) / m |
| 137 for k,v in feats.items(): | 190 for k,v in feats.items(): |
| 138 feats[k] = [val*mul[i] for i,val in enumerate(v)] | 191 feats[k] = [val*mul[i] for i,val in enumerate(v)] |
| 192 if numpy.mean(feats[k]) and (numpy.std(feats[k])/numpy.mean(feats[k])) < 1e-10: | |
| 193 feats[k] = [ float(round(kv*1e6)/1e6) for kv in feats[k]] | |
| 139 return feats | 194 return feats |
| 140 | 195 |
| 141 def add_missing_levels2(ff): | 196 def add_missing_levels2(ff): |
| 142 | 197 |
| 143 if sum( [f.count(".") for f in ff] ) < 1: return ff | 198 if sum( [f.count(".") for f in ff] ) < 1: return ff |
| 217 new_subcl = [] | 272 new_subcl = [] |
| 218 for i,sc in enumerate(subcl): | 273 for i,sc in enumerate(subcl): |
| 219 if sc in toc: new_subcl.append(cl[i]+"_"+sc) | 274 if sc in toc: new_subcl.append(cl[i]+"_"+sc) |
| 220 else: new_subcl.append(sc) | 275 else: new_subcl.append(sc) |
| 221 return new_subcl | 276 return new_subcl |
| 277 | |
| 278 | |
| 279 #************************************************************************************* | |
| 280 #* Modifications by George Weingart, Jan 15, 2014 * | |
| 281 #* If the input file is biom: * | |
| 282 #* a. Load an AbundanceTable (Using breadcrumbs) * | |
| 283 #* b. Create a sequential file from the AbundanceTable (de-facto - pcl) * | |
| 284 #* c. Use that file as input to the rest of the program * | |
| 285 #* d. Calculate the c,s,and u parameters, either from the values the User entered * | |
| 286 #* from the meta data values in the biom file or set up defaults * | |
| 287 #* <<<------------- I M P O R T A N T N O T E ------------------->> * | |
| 288 #* breadcrumbs src directory must be included in the PYTHONPATH * | |
| 289 #* <<<------------- I M P O R T A N T N O T E ------------------->> * | |
| 290 #************************************************************************************* | |
| 291 def biom_processing(inp_file): | |
| 292 CommonArea = dict() #* Set up a dictionary to return | |
| 293 CommonArea['abndData'] = AbundanceTable.funcMakeFromFile(inp_file, #* Create AbundanceTable from input biom file | |
| 294 cDelimiter = None, | |
| 295 sMetadataID = None, | |
| 296 sLastMetadataRow = None, | |
| 297 sLastMetadata = None, | |
| 298 strFormat = None) | |
| 299 | |
| 300 #**************************************************************** | |
| 301 #* Building the data element here * | |
| 302 #**************************************************************** | |
| 303 ResolvedData = list() #This is the Resolved data that will be returned | |
| 304 IDMetadataName = CommonArea['abndData'].funcGetIDMetadataName() #* ID Metadataname | |
| 305 IDMetadata = [CommonArea['abndData'].funcGetIDMetadataName()] #* The first Row | |
| 306 for IDMetadataEntry in CommonArea['abndData'].funcGetMetadataCopy()[IDMetadataName]: #* Loop on all the metadata values | |
| 307 IDMetadata.append(IDMetadataEntry) | |
| 308 ResolvedData.append(IDMetadata) #Add the IDMetadata with all its values to the resolved area | |
| 309 for key, value in CommonArea['abndData'].funcGetMetadataCopy().iteritems(): | |
| 310 if key != IDMetadataName: | |
| 311 MetadataEntry = list() #* Set it up | |
| 312 MetadataEntry.append(key) #* And post it to the area | |
| 313 for x in value: | |
| 314 MetadataEntry.append(x) #* Append the metadata value name | |
| 315 ResolvedData.append(MetadataEntry) | |
| 316 for AbundanceDataEntry in CommonArea['abndData'].funcGetAbundanceCopy(): #* The Abundance Data | |
| 317 lstAbundanceDataEntry = list(AbundanceDataEntry) #Convert tuple to list | |
| 318 ResolvedData.append(lstAbundanceDataEntry) #Append the list to the metadata list | |
| 319 CommonArea['ReturnedData'] = ResolvedData #Post the results | |
| 320 return CommonArea | |
| 321 | |
| 322 | |
| 323 #******************************************************************************* | |
| 324 #* Check the params and override in the case of biom * | |
| 325 #******************************************************************************* | |
| 326 def check_params_for_biom_case(params, CommonArea): | |
| 327 CommonArea['MetadataNames'] = list() #Metadata names | |
| 328 params['original_class'] = params['class'] #Save the original class | |
| 329 params['original_subclass'] = params['subclass'] #Save the original subclass | |
| 330 params['original_subject'] = params['subject'] #Save the original subclass | |
| 331 | |
| 332 | |
| 333 TotalMetadataEntriesAndIDInBiomFile = len(CommonArea['abndData'].funcGetMetadataCopy()) # The number of metadata entries | |
| 334 for i in range(0,TotalMetadataEntriesAndIDInBiomFile): #* Populate the meta data names table | |
| 335 CommonArea['MetadataNames'].append(CommonArea['ReturnedData'][i][0]) #Add the metadata name | |
| 336 | |
| 337 | |
| 338 #**************************************************** | |
| 339 #* Setting the params here * | |
| 340 #**************************************************** | |
| 341 | |
| 342 if TotalMetadataEntriesAndIDInBiomFile > 0: #If there is at least one entry - has to be the subject | |
| 343 params['subject'] = 1 | |
| 344 if TotalMetadataEntriesAndIDInBiomFile == 2: #If there are 2 - The first is the subject and the second has to be the metadata, and that is the class | |
| 345 params['class'] = 2 | |
| 346 if TotalMetadataEntriesAndIDInBiomFile == 3: #If there are 3: Set up default that the second entry is the class and the third is the subclass | |
| 347 params['class'] = 2 | |
| 348 params['subclass'] = 3 | |
| 349 FlagError = False #Set up error flag | |
| 350 | |
| 351 if not params['biom_class'] is None and not params['biom_subclass'] is None: #Check if the User passed a valid class and subclass | |
| 352 if params['biom_class'] in CommonArea['MetadataNames']: | |
| 353 params['class'] = CommonArea['MetadataNames'].index(params['biom_class']) +1 #* Set up the index for that metadata | |
| 354 else: | |
| 355 FlagError = True | |
| 356 if params['biom_subclass'] in CommonArea['MetadataNames']: | |
| 357 params['subclass'] = CommonArea['MetadataNames'].index(params['biom_subclass']) +1 #* Set up the index for that metadata | |
| 358 else: | |
| 359 FlagError = True | |
| 360 if FlagError == True: #* If the User passed an invalid class | |
| 361 print "**Invalid biom class or subclass passed - Using defaults: First metadata=class, Second Metadata=subclass\n" | |
| 362 params['class'] = 2 | |
| 363 params['subclass'] = 3 | |
| 364 return params | |
| 365 | |
| 366 | |
| 222 | 367 |
| 223 if __name__ == '__main__': | 368 if __name__ == '__main__': |
| 369 CommonArea = dict() #Build a Common Area to pass variables in the biom case | |
| 224 params = read_params(sys.argv) | 370 params = read_params(sys.argv) |
| 225 | 371 |
| 372 #************************************************************* | |
| 373 #* Conditionally import breadcrumbs if file is a biom file * | |
| 374 #* If it is and no breadcrumbs found - abnormally exit * | |
| 375 #************************************************************* | |
| 376 if params['input_file'].endswith('.biom'): | |
| 377 try: | |
| 378 from lefsebiom.ConstantsBreadCrumbs import * | |
| 379 from lefsebiom.AbundanceTable import * | |
| 380 except ImportError: | |
| 381 sys.stderr.write("************************************************************************************************************ \n") | |
| 382 sys.stderr.write("* Error: Breadcrumbs libraries not detected - required to process biom files - run abnormally terminated * \n") | |
| 383 sys.stderr.write("************************************************************************************************************ \n") | |
| 384 exit(1) | |
| 385 | |
| 386 | |
| 226 if type(params['subclass']) is int and int(params['subclass']) < 1: | 387 if type(params['subclass']) is int and int(params['subclass']) < 1: |
| 227 params['subclass'] = None | 388 params['subclass'] = None |
| 228 if type(params['subject']) is int and int(params['subject']) < 1: | 389 if type(params['subject']) is int and int(params['subject']) < 1: |
| 229 params['subject'] = None | 390 params['subject'] = None |
| 230 data = read_input_file(sys.argv[1]) | 391 |
| 392 | |
| 393 CommonArea = read_input_file(sys.argv[1], CommonArea) #Pass The CommonArea to the Read | |
| 394 data = CommonArea['ReturnedData'] #Select the data | |
| 395 | |
| 396 if sys.argv[1].endswith('biom'): #* Check if biom: | |
| 397 params = check_params_for_biom_case(params, CommonArea) #Check the params for the biom case | |
| 231 | 398 |
| 232 if params['feats_dir'] == "c": | 399 if params['feats_dir'] == "c": |
| 233 data = transpose(data) | 400 data = transpose(data) |
| 234 | 401 |
| 235 ncl = 1 | 402 ncl = 1 |
