Mercurial > repos > george-weingart > graphlan_import
diff merge_metaphlan_tables.py @ 0:cac6247cb1d3 draft
graphlan_import
author | george-weingart |
---|---|
date | Tue, 26 Aug 2014 14:51:29 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/merge_metaphlan_tables.py Tue Aug 26 14:51:29 2014 -0400 @@ -0,0 +1,103 @@ +#!/usr/bin/env python + +# ============================================================================== +# Merge script: from MetaPhlAn output on single sample to a joined "clades vs samples" table +# Authors: Timothy Tickle (ttickle@hsph.harvard.edu) and Curtis Huttenhower (chuttenh@hsph.harvard.edu) +# ============================================================================== + +import argparse +import csv +import os +import sys + + +def merge( aaastrIn, astrLabels, iCol, ostm ): + """ + Outputs the table join of the given pre-split string collection. + + :param aaastrIn: One or more split lines from which data are read. + :type aaastrIn: collection of collections of string collections + :param astrLabels: File names of input data. + :type astrLabels: collection of strings + :param iCol: Data column in which IDs are matched (zero-indexed). + :type iCol: int + :param ostm: Output stream to which matched rows are written. + :type ostm: output stream + + """ + + setstrIDs = set() + """The final set of all IDs in any table.""" + ahashIDs = [{} for i in range( len( aaastrIn ) )] + """One hash of IDs to row numbers for each input datum.""" + aaastrData = [[] for i in range( len( aaastrIn ) )] + """One data table for each input datum.""" + aastrHeaders = [[] for i in range( len( aaastrIn ) )] + """The list of non-ID headers for each input datum.""" + strHeader = "ID" + """The ID column header.""" + + # For each input datum in each input stream... + pos = 0 + + for f in aaastrIn : + with open(f) as csvfile : + iIn = csv.reader(csvfile, csv.excel_tab) + + # Lines from the current file, empty list to hold data, empty hash to hold ids + aastrData, hashIDs = (a[pos] for a in (aaastrData, ahashIDs)) + + iLine = -1 + # For a line in the file + for astrLine in iIn: + iLine += 1 + + # ID is from first column, data are everything else + strID, astrData = astrLine[iCol], ( astrLine[:iCol] + astrLine[( iCol + 1 ):] ) + + hashIDs[strID] = iLine + aastrData.append( astrData ) + + # Batch merge every new ID key set + setstrIDs.update( hashIDs.keys( ) ) + + pos += 1 + + # Create writer + csvw = csv.writer( ostm, csv.excel_tab, lineterminator='\n' ) + + # Make the file names the column names + csvw.writerow( [strHeader] + [os.path.splitext(f)[0] for f in astrLabels] ) + + # Write out data + for strID in sorted( setstrIDs ): + astrOut = [] + for iIn in range( len( aaastrIn ) ): + aastrData, hashIDs = (a[iIn] for a in (aaastrData, ahashIDs)) + # Look up the row number of the current ID in the current dataset, if any + iID = hashIDs.get( strID ) + # If not, start with no data; if yes, pull out stored data row + astrData = [0.0] if ( iID == None ) else aastrData[iID] + # Pad output data as needed + astrData += [None] * ( len( aastrHeaders[iIn] ) - len( astrData ) ) + astrOut += astrData + csvw.writerow( [strID] + astrOut ) + + +argp = argparse.ArgumentParser( prog = "merge_metaphlan_tables.py", + description = """Performs a table join on one or more metaphlan output files.""") +argp.add_argument( "aistms", metavar = "input.txt", nargs = "+", + help = "One or more tab-delimited text tables to join" ) + +__doc__ = "::\n\n\t" + argp.format_help( ).replace( "\n", "\n\t" ) + +argp.usage = argp.format_usage()[7:]+"\n\n\tPlease make sure to supply file paths to the files to combine. If combining 3 files (Table1.txt, Table2.txt, and Table3.txt) the call should be:\n\n\t\tpython merge_metaphlan_tables.py Table1.txt Table2.txt Table3.txt > output.txt\n\n\tA wildcard to indicate all .txt files that start with Table can be used as follows:\n\n\t\tpython merge_metaphlan_tables.py Table*.txt > output.txt" + + +def _main( ): + args = argp.parse_args( ) + merge(args.aistms, [os.path.split(os.path.basename(f))[1] for f in args.aistms], 0, sys.stdout) + + +if __name__ == "__main__": + _main( )