Mercurial > repos > george-weingart > lefse

--- a/format_input.py	Wed Aug 27 15:54:03 2014 -0400
+++ b/format_input.py	Tue Jul 07 13:26:55 2015 -0400
@@ -1,10 +1,58 @@
 #!/usr/bin/env python

-import sys,os,argparse,pickle,re
+import sys,os,argparse,pickle,re,numpy
+
+
+

-def read_input_file(inp_file):
+#***************************************************************************************************************
+#*   Log of change                                                                                             *
+#*   January 16, 2014  - George Weingart - george.weingart@gmail.com                                           *
+#*                                                                                                             *
+#*   biom Support                                                                                              *
+#*   Modified the program to enable it to accept biom files as input                                           *
+#*                                                                                                             *
+#*   Added two optional input parameters:	                                                                   *
+#*   1. biom_c is the name of the biom metadata to be used as class                                            *
+#*   2. biom_s is the name of the biom metadata to be used as subclass                                         *
+#*   class and subclass are used in the same context as the original                                           *
+#*   parameters class and subclass                                                                             *
+#*   These parameters are totally optional, the default is the program                                         *
+#*   chooses as class the first metadata received from the conversion                                          *
+#*   of the biom file into a sequential (pcl) file as generated by                                             *
+#*   breadcrumbs, and similarly, the second metadata is selected as                                            *
+#*   subclass.                                                                                                 *
+#*   The syntax or logic for the original non-biom case was NOT changed.                                       *
+#*                                                                                                             *
+#*   <*******************  IMPORTANT NOTE   *************************>                                         *
+#*   The biom case requires breadcrumbs and therefore there is a                                               *
+#*      a conditional import of the breadcrumbs modules                                                        *
+#*   If the User uses a biom input and breadcrumbs is not detected,                                            *
+#*       the run is abnormally ended                                                                           *
+#*   breadcrumbs itself needs a biom environment, so if the immport                                            *
+#*       of biom in breadcrumbs fails,  the run is also abnormally
+#*       ended (Only if the input file was biom)                                                               *
+#*                                                                                                             *
+#*   USAGE EXAMPLES                                                                                            *
+#*   --------------                                                                                            *
+#*   Case #1: Using a sequential file as input (Old version - did not change                                   *
+#*  ./format_input.py hmp_aerobiosis_small.txt hmp_aerobiosis_small.in -c 1 -s 2 -u 3 -o 1000000               *
+#*   Case #2: Using a biom file as input                                                                       *
+#*  ./format_input.py hmp_aerobiosis_small.biom hmp_aerobiosis_small.in  -o 1000000                            *
+#*   Case #3: Using a biom file as input and override the class and subclass                                   *
+#*   ./format_input.py lefse.biom hmp_aerobiosis_small.in -biom_c oxygen_availability -biom_s body_site -o 1000000
+#*                                                                                                             *
+#***************************************************************************************************************
+
+def read_input_file(inp_file, CommonArea):
+
+	if inp_file.endswith('.biom'):   			#*  If the file format is biom:
+		CommonArea = biom_processing(inp_file)  #*  Process in biom format
+		return CommonArea 						#*  And return the CommonArea
+
 	with open(inp_file) as inp:
-		return [[v.strip() for v in line.strip().split("\t")] for line in inp.readlines()]
+		CommonArea['ReturnedData'] = [[v.strip() for v in line.strip().split("\t")] for line in inp.readlines()]
+		return CommonArea

 def transpose(data):
 	return zip(*data)
@@ -31,6 +79,11 @@
 	parser.add_argument('-n',dest="subcl_min_card", metavar="int", type=int, default=10,
 		help="set the minimum cardinality of each subclass (subclasses with low cardinalities will be grouped together, if the cardinality is still low, no pairwise comparison will be performed with them)")

+	parser.add_argument('-biom_c',dest="biom_class", type=str,
+		help="For biom input files: Set which feature use as class  ")
+	parser.add_argument('-biom_s',dest="biom_subclass", type=str,
+		help="For biom input files: set which feature use as subclass   ")
+
 	args = parser.parse_args()

 	return vars(args)
@@ -136,6 +189,8 @@
 		else: mul[i] = float(norm) / m
 	for k,v in feats.items():
 		feats[k] = [val*mul[i] for i,val in enumerate(v)]
+		if numpy.mean(feats[k]) and (numpy.std(feats[k])/numpy.mean(feats[k])) < 1e-10:
+			feats[k] = [ float(round(kv*1e6)/1e6) for kv in feats[k]]
 	return feats

 def add_missing_levels2(ff):
@@ -219,15 +274,127 @@
 		if sc in toc: new_subcl.append(cl[i]+"_"+sc)
 		else: new_subcl.append(sc)
 	return new_subcl
+
+
+#*************************************************************************************
+#*  Modifications by George Weingart,  Jan 15, 2014                                  *
+#*  If the input file is biom:                                                       *
+#*  a. Load an AbundanceTable (Using breadcrumbs)                                    *
+#*  b. Create a sequential file from the AbundanceTable (de-facto - pcl)             *
+#*  c. Use that file as input to the rest of the program                             *
+#*  d. Calculate the c,s,and u parameters, either from the values the User entered   *
+#*     from the meta data values in the biom file or set up defaults                 *
+#*  <<<-------------  I M P O R T A N T     N O T E ------------------->>            *
+#*  breadcrumbs src directory must be included in the PYTHONPATH                     *
+#*  <<<-------------  I M P O R T A N T     N O T E ------------------->>            *
+#*************************************************************************************
+def biom_processing(inp_file):
+	CommonArea = dict()			#* Set up a dictionary to return
+	CommonArea['abndData']   = AbundanceTable.funcMakeFromFile(inp_file,	#* Create AbundanceTable from input biom file
+		cDelimiter = None,
+		sMetadataID = None,
+		sLastMetadataRow = None,
+		sLastMetadata = None,
+		strFormat = None)
+
+	#****************************************************************
+	#*  Building the data element here                              *
+	#****************************************************************
+	ResolvedData = list()		#This is the Resolved data that will be returned
+	IDMetadataName  = CommonArea['abndData'].funcGetIDMetadataName()   #* ID Metadataname
+	IDMetadata = [CommonArea['abndData'].funcGetIDMetadataName()]  #* The first Row
+	for IDMetadataEntry in CommonArea['abndData'].funcGetMetadataCopy()[IDMetadataName]:  #* Loop on all the metadata values
+		IDMetadata.append(IDMetadataEntry)
+	ResolvedData.append(IDMetadata)					#Add the IDMetadata with all its values to the resolved area
+	for key, value in  CommonArea['abndData'].funcGetMetadataCopy().iteritems():
+		if  key  != IDMetadataName:
+			MetadataEntry = list()		#*  Set it up
+			MetadataEntry.append(key)	#*	And post it to the area
+			for x in value:
+				MetadataEntry.append(x)		#* Append the metadata value name
+			ResolvedData.append(MetadataEntry)
+	for AbundanceDataEntry in    CommonArea['abndData'].funcGetAbundanceCopy(): 		#* The Abundance Data
+		lstAbundanceDataEntry = list(AbundanceDataEntry)	#Convert tuple to list
+		ResolvedData.append(lstAbundanceDataEntry)			#Append the list to the metadata list
+	CommonArea['ReturnedData'] = 	ResolvedData			#Post the results
+	return CommonArea
+
+
+#*******************************************************************************
+#*    Check the params and override in the case of biom                        *
+#*******************************************************************************
+def  check_params_for_biom_case(params, CommonArea):
+	CommonArea['MetadataNames'] = list()			#Metadata  names
+	params['original_class'] = params['class']			#Save the original class
+	params['original_subclass'] = params['subclass']	#Save the original subclass
+	params['original_subject'] = params['subject']	#Save the original subclass
+
+
+	TotalMetadataEntriesAndIDInBiomFile = len(CommonArea['abndData'].funcGetMetadataCopy())  # The number of metadata entries
+	for i in range(0,TotalMetadataEntriesAndIDInBiomFile): 	#* Populate the meta data names table
+		CommonArea['MetadataNames'].append(CommonArea['ReturnedData'][i][0])	#Add the metadata name
+
+
+	#****************************************************
+	#* Setting the params here                          *
+	#****************************************************
+
+	if TotalMetadataEntriesAndIDInBiomFile > 0:		#If there is at least one entry - has to be the subject
+		params['subject'] =  1
+	if TotalMetadataEntriesAndIDInBiomFile == 2:		#If there are 2 - The first is the subject and the second has to be the metadata, and that is the class
+		params['class'] =  2
+	if TotalMetadataEntriesAndIDInBiomFile == 3:		#If there are 3:  Set up default that the second entry is the class and the third is the subclass
+		params['class'] =  2
+		params['subclass'] =  3
+		FlagError = False								#Set up error flag
+
+		if not params['biom_class'] is None and not params['biom_subclass'] is None:				#Check if the User passed a valid class and subclass
+			if  params['biom_class'] in CommonArea['MetadataNames']:
+				params['class'] =  CommonArea['MetadataNames'].index(params['biom_class']) +1	#* Set up the index for that metadata
+			else:
+				FlagError = True
+			if  params['biom_subclass'] in  CommonArea['MetadataNames']:
+				params['subclass'] =  CommonArea['MetadataNames'].index(params['biom_subclass']) +1 #* Set up the index for that metadata
+			else:
+				FlagError = True
+		if FlagError == True:		#* If the User passed an invalid class
+			print "**Invalid biom class or subclass passed - Using defaults: First metadata=class, Second Metadata=subclass\n"
+			params['class'] =  2
+			params['subclass'] =  3
+	return params
+
+

 if  __name__ == '__main__':
+	CommonArea = dict()			#Build a Common Area to pass variables in the biom case
 	params = read_params(sys.argv)

+	#*************************************************************
+	#* Conditionally import breadcrumbs if file is a biom file   *
+	#* If it is and no breadcrumbs found - abnormally exit       *
+	#*************************************************************
+	if  params['input_file'].endswith('.biom'):
+		try:
+			from lefsebiom.ConstantsBreadCrumbs import *
+			from lefsebiom.AbundanceTable import *
+		except ImportError:
+			sys.stderr.write("************************************************************************************************************ \n")
+			sys.stderr.write("* Error:   Breadcrumbs libraries not detected - required to process biom files - run abnormally terminated * \n")
+			sys.stderr.write("************************************************************************************************************ \n")
+			exit(1)
+
+
 	if type(params['subclass']) is int and int(params['subclass']) < 1:
 		params['subclass'] = None
 	if type(params['subject']) is int and int(params['subject']) < 1:
 		params['subject'] = None
-	data = read_input_file(sys.argv[1])
+
+
+	CommonArea = read_input_file(sys.argv[1], CommonArea)		#Pass The CommonArea to the Read
+	data = CommonArea['ReturnedData']					#Select the data
+
+	if sys.argv[1].endswith('biom'):	#*	Check if biom:
+		params = check_params_for_biom_case(params, CommonArea)	#Check the params for the biom case

 	if params['feats_dir'] == "c":
 		data = transpose(data)