Mercurial > repos > george-weingart > graphlan_import
comparison merge_metaphlan_tables.py @ 0:cac6247cb1d3 draft
graphlan_import
| author | george-weingart |
|---|---|
| date | Tue, 26 Aug 2014 14:51:29 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:cac6247cb1d3 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 # ============================================================================== | |
| 4 # Merge script: from MetaPhlAn output on single sample to a joined "clades vs samples" table | |
| 5 # Authors: Timothy Tickle (ttickle@hsph.harvard.edu) and Curtis Huttenhower (chuttenh@hsph.harvard.edu) | |
| 6 # ============================================================================== | |
| 7 | |
| 8 import argparse | |
| 9 import csv | |
| 10 import os | |
| 11 import sys | |
| 12 | |
| 13 | |
def merge( aaastrIn, astrLabels, iCol, ostm ):
    """
    Outputs the table join of the given tab-delimited input files.

    Every input file is read with the ``csv.excel_tab`` dialect. In each
    non-blank row the value in column ``iCol`` is the row ID and the
    remaining columns are that row's data. The output starts with a header
    row (``ID`` followed by each label with its extension removed), then one
    row per distinct ID in sorted order. For each input file, in order, the
    row holds that file's data for the ID, or a single ``0.0`` when the file
    does not contain the ID. If an ID occurs more than once in one file, the
    last occurrence wins.

    :param aaastrIn: Paths of the tab-delimited files to join.
    :type aaastrIn: collection of strings
    :param astrLabels: File names of input data, used for the header row.
    :type astrLabels: collection of strings
    :param iCol: Data column in which IDs are matched (zero-indexed).
    :type iCol: int
    :param ostm: Output stream to which matched rows are written.
    :type ostm: output stream

    """

    setstrIDs = set()       # every ID seen in any input file
    ahashIDs = []           # per input file: ID -> row number in its data table
    aaastrData = []         # per input file: list of data rows (ID column removed)

    for strPath in aaastrIn:
        hashIDs = {}
        aastrData = []
        with open( strPath ) as csvfile:
            for astrLine in csv.reader( csvfile, csv.excel_tab ):
                # csv.reader yields an empty list for a blank line; such a
                # row has no ID column, so indexing it would raise IndexError.
                if not astrLine:
                    continue
                # Row numbers index aastrData, so count only the rows kept.
                hashIDs[astrLine[iCol]] = len( aastrData )
                aastrData.append( astrLine[:iCol] + astrLine[( iCol + 1 ):] )
        setstrIDs.update( hashIDs )
        ahashIDs.append( hashIDs )
        aaastrData.append( aastrData )

    csvw = csv.writer( ostm, csv.excel_tab, lineterminator='\n' )

    # Make the file names (minus extension) the column names
    csvw.writerow( ["ID"] + [os.path.splitext( f )[0] for f in astrLabels] )

    for strID in sorted( setstrIDs ):
        astrOut = [strID]
        for aastrData, hashIDs in zip( aaastrData, ahashIDs ):
            iID = hashIDs.get( strID )
            # extend() copies the values, so the stored rows are never mutated.
            astrOut.extend( [0.0] if iID is None else aastrData[iID] )
        csvw.writerow( astrOut )
| 86 | |
# Command-line interface: one or more positional input table paths.
argp = argparse.ArgumentParser(
    prog="merge_metaphlan_tables.py",
    description="""Performs a table join on one or more metaphlan output files.""",
)
argp.add_argument(
    "aistms",
    metavar="input.txt",
    nargs="+",
    help="One or more tab-delimited text tables to join",
)

# The module docstring is the generated help text, tab-indented under a "::"
# marker. It is built BEFORE the usage override below, so it carries the
# automatically generated usage line rather than the extended one.
__doc__ = "::\n\n\t" + "\n\t".join(argp.format_help().split("\n"))

# Extend the usage message with worked examples. The slice drops the first
# seven characters of the generated usage text (the "usage: " prefix).
argp.usage = argp.format_usage()[7:] + (
    "\n\n\tPlease make sure to supply file paths to the files to combine. "
    "If combining 3 files (Table1.txt, Table2.txt, and Table3.txt) the call should be:"
    "\n\n\t\tpython merge_metaphlan_tables.py Table1.txt Table2.txt Table3.txt > output.txt"
    "\n\n\tA wildcard to indicate all .txt files that start with Table can be used as follows:"
    "\n\n\t\tpython merge_metaphlan_tables.py Table*.txt > output.txt"
)
| 95 | |
| 96 | |
def _main():
    """Parse the command line and write the joined table to standard output."""
    astrPaths = argp.parse_args().aistms
    # Column labels are the bare file names, with any directory part removed.
    astrLabels = [os.path.basename(strPath) for strPath in astrPaths]
    # IDs are matched on the first column (index 0).
    merge(astrPaths, astrLabels, 0, sys.stdout)


if __name__ == "__main__":
    _main()
