Mercurial > repos > george-weingart > lefse
changeset 20:a6284ef17bf3 draft default tip
Fixed bug due to numerical approximation after normalization affecting root-level clades (e.g. "Bacteria" or "Archaea")
author | george-weingart |
---|---|
date | Tue, 07 Jul 2015 13:26:55 -0400 |
parents | 7262089c7b47 |
children | |
files | format_input.py |
diffstat | 1 files changed, 171 insertions(+), 4 deletions(-) [+] |
line wrap: on
line diff
--- a/format_input.py Wed Aug 27 15:54:03 2014 -0400 +++ b/format_input.py Tue Jul 07 13:26:55 2015 -0400 @@ -1,10 +1,58 @@ #!/usr/bin/env python -import sys,os,argparse,pickle,re +import sys,os,argparse,pickle,re,numpy + + + -def read_input_file(inp_file): +#*************************************************************************************************************** +#* Log of change * +#* January 16, 2014 - George Weingart - george.weingart@gmail.com * +#* * +#* biom Support * +#* Modified the program to enable it to accept biom files as input * +#* * +#* Added two optional input parameters: * +#* 1. biom_c is the name of the biom metadata to be used as class * +#* 2. biom_s is the name of the biom metadata to be used as subclass * +#* class and subclass are used in the same context as the original * +#* parameters class and subclass * +#* These parameters are totally optional, the default is the program * +#* chooses as class the first metadata received from the conversion * +#* of the biom file into a sequential (pcl) file as generated by * +#* breadcrumbs, and similarly, the second metadata is selected as * +#* subclass. * +#* The syntax or logic for the original non-biom case was NOT changed. * +#* * +#* <******************* IMPORTANT NOTE *************************> * +#* The biom case requires breadcrumbs and therefore there is a * +#* a conditional import of the breadcrumbs modules * +#* If the User uses a biom input and breadcrumbs is not detected, * +#* the run is abnormally ended * +#* breadcrumbs itself needs a biom environment, so if the immport * +#* of biom in breadcrumbs fails, the run is also abnormally +#* ended (Only if the input file was biom) * +#* * +#* USAGE EXAMPLES * +#* -------------- * +#* Case #1: Using a sequential file as input (Old version - did not change * +#* ./format_input.py hmp_aerobiosis_small.txt hmp_aerobiosis_small.in -c 1 -s 2 -u 3 -o 1000000 * +#* Case #2: Using a biom file as input * +#* ./format_input.py hmp_aerobiosis_small.biom hmp_aerobiosis_small.in -o 1000000 * +#* Case #3: Using a biom file as input and override the class and subclass * +#* ./format_input.py lefse.biom hmp_aerobiosis_small.in -biom_c oxygen_availability -biom_s body_site -o 1000000 +#* * +#*************************************************************************************************************** + +def read_input_file(inp_file, CommonArea): + + if inp_file.endswith('.biom'): #* If the file format is biom: + CommonArea = biom_processing(inp_file) #* Process in biom format + return CommonArea #* And return the CommonArea + with open(inp_file) as inp: - return [[v.strip() for v in line.strip().split("\t")] for line in inp.readlines()] + CommonArea['ReturnedData'] = [[v.strip() for v in line.strip().split("\t")] for line in inp.readlines()] + return CommonArea def transpose(data): return zip(*data) @@ -31,6 +79,11 @@ parser.add_argument('-n',dest="subcl_min_card", metavar="int", type=int, default=10, help="set the minimum cardinality of each subclass (subclasses with low cardinalities will be grouped together, if the cardinality is still low, no pairwise comparison will be performed with them)") + parser.add_argument('-biom_c',dest="biom_class", type=str, + help="For biom input files: Set which feature use as class ") + parser.add_argument('-biom_s',dest="biom_subclass", type=str, + help="For biom input files: set which feature use as subclass ") + args = parser.parse_args() return vars(args) @@ -136,6 +189,8 @@ else: mul[i] = float(norm) / m for k,v in feats.items(): feats[k] = [val*mul[i] for i,val in enumerate(v)] + if numpy.mean(feats[k]) and (numpy.std(feats[k])/numpy.mean(feats[k])) < 1e-10: + feats[k] = [ float(round(kv*1e6)/1e6) for kv in feats[k]] return feats def add_missing_levels2(ff): @@ -219,15 +274,127 @@ if sc in toc: new_subcl.append(cl[i]+"_"+sc) else: new_subcl.append(sc) return new_subcl + + +#************************************************************************************* +#* Modifications by George Weingart, Jan 15, 2014 * +#* If the input file is biom: * +#* a. Load an AbundanceTable (Using breadcrumbs) * +#* b. Create a sequential file from the AbundanceTable (de-facto - pcl) * +#* c. Use that file as input to the rest of the program * +#* d. Calculate the c,s,and u parameters, either from the values the User entered * +#* from the meta data values in the biom file or set up defaults * +#* <<<------------- I M P O R T A N T N O T E ------------------->> * +#* breadcrumbs src directory must be included in the PYTHONPATH * +#* <<<------------- I M P O R T A N T N O T E ------------------->> * +#************************************************************************************* +def biom_processing(inp_file): + CommonArea = dict() #* Set up a dictionary to return + CommonArea['abndData'] = AbundanceTable.funcMakeFromFile(inp_file, #* Create AbundanceTable from input biom file + cDelimiter = None, + sMetadataID = None, + sLastMetadataRow = None, + sLastMetadata = None, + strFormat = None) + + #**************************************************************** + #* Building the data element here * + #**************************************************************** + ResolvedData = list() #This is the Resolved data that will be returned + IDMetadataName = CommonArea['abndData'].funcGetIDMetadataName() #* ID Metadataname + IDMetadata = [CommonArea['abndData'].funcGetIDMetadataName()] #* The first Row + for IDMetadataEntry in CommonArea['abndData'].funcGetMetadataCopy()[IDMetadataName]: #* Loop on all the metadata values + IDMetadata.append(IDMetadataEntry) + ResolvedData.append(IDMetadata) #Add the IDMetadata with all its values to the resolved area + for key, value in CommonArea['abndData'].funcGetMetadataCopy().iteritems(): + if key != IDMetadataName: + MetadataEntry = list() #* Set it up + MetadataEntry.append(key) #* And post it to the area + for x in value: + MetadataEntry.append(x) #* Append the metadata value name + ResolvedData.append(MetadataEntry) + for AbundanceDataEntry in CommonArea['abndData'].funcGetAbundanceCopy(): #* The Abundance Data + lstAbundanceDataEntry = list(AbundanceDataEntry) #Convert tuple to list + ResolvedData.append(lstAbundanceDataEntry) #Append the list to the metadata list + CommonArea['ReturnedData'] = ResolvedData #Post the results + return CommonArea + + +#******************************************************************************* +#* Check the params and override in the case of biom * +#******************************************************************************* +def check_params_for_biom_case(params, CommonArea): + CommonArea['MetadataNames'] = list() #Metadata names + params['original_class'] = params['class'] #Save the original class + params['original_subclass'] = params['subclass'] #Save the original subclass + params['original_subject'] = params['subject'] #Save the original subclass + + + TotalMetadataEntriesAndIDInBiomFile = len(CommonArea['abndData'].funcGetMetadataCopy()) # The number of metadata entries + for i in range(0,TotalMetadataEntriesAndIDInBiomFile): #* Populate the meta data names table + CommonArea['MetadataNames'].append(CommonArea['ReturnedData'][i][0]) #Add the metadata name + + + #**************************************************** + #* Setting the params here * + #**************************************************** + + if TotalMetadataEntriesAndIDInBiomFile > 0: #If there is at least one entry - has to be the subject + params['subject'] = 1 + if TotalMetadataEntriesAndIDInBiomFile == 2: #If there are 2 - The first is the subject and the second has to be the metadata, and that is the class + params['class'] = 2 + if TotalMetadataEntriesAndIDInBiomFile == 3: #If there are 3: Set up default that the second entry is the class and the third is the subclass + params['class'] = 2 + params['subclass'] = 3 + FlagError = False #Set up error flag + + if not params['biom_class'] is None and not params['biom_subclass'] is None: #Check if the User passed a valid class and subclass + if params['biom_class'] in CommonArea['MetadataNames']: + params['class'] = CommonArea['MetadataNames'].index(params['biom_class']) +1 #* Set up the index for that metadata + else: + FlagError = True + if params['biom_subclass'] in CommonArea['MetadataNames']: + params['subclass'] = CommonArea['MetadataNames'].index(params['biom_subclass']) +1 #* Set up the index for that metadata + else: + FlagError = True + if FlagError == True: #* If the User passed an invalid class + print "**Invalid biom class or subclass passed - Using defaults: First metadata=class, Second Metadata=subclass\n" + params['class'] = 2 + params['subclass'] = 3 + return params + + if __name__ == '__main__': + CommonArea = dict() #Build a Common Area to pass variables in the biom case params = read_params(sys.argv) + #************************************************************* + #* Conditionally import breadcrumbs if file is a biom file * + #* If it is and no breadcrumbs found - abnormally exit * + #************************************************************* + if params['input_file'].endswith('.biom'): + try: + from lefsebiom.ConstantsBreadCrumbs import * + from lefsebiom.AbundanceTable import * + except ImportError: + sys.stderr.write("************************************************************************************************************ \n") + sys.stderr.write("* Error: Breadcrumbs libraries not detected - required to process biom files - run abnormally terminated * \n") + sys.stderr.write("************************************************************************************************************ \n") + exit(1) + + if type(params['subclass']) is int and int(params['subclass']) < 1: params['subclass'] = None if type(params['subject']) is int and int(params['subject']) < 1: params['subject'] = None - data = read_input_file(sys.argv[1]) + + + CommonArea = read_input_file(sys.argv[1], CommonArea) #Pass The CommonArea to the Read + data = CommonArea['ReturnedData'] #Select the data + + if sys.argv[1].endswith('biom'): #* Check if biom: + params = check_params_for_biom_case(params, CommonArea) #Check the params for the biom case if params['feats_dir'] == "c": data = transpose(data)