0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 # ==============================================================================
|
|
4 # Merge script: from MetaPhlAn output on single sample to a joined "clades vs samples" table
|
|
5 # Authors: Timothy Tickle (ttickle@hsph.harvard.edu) and Curtis Huttenhower (chuttenh@hsph.harvard.edu)
|
|
6 # ==============================================================================
|
|
7
|
|
8 import argparse
|
|
9 import csv
|
|
10 import os
|
|
11 import sys
|
|
12
|
|
13
|
|
def merge( aaastrIn, astrLabels, iCol, ostm ):
    """
    Outputs the table join of the given tab-delimited input files.

    Each input file is read as a tab-delimited table.  The value in column
    ``iCol`` of a row is that row's ID; the remaining columns are its data.
    A header row is written first (``ID`` followed by every label with its
    file extension removed), then one row per distinct ID in sorted order
    holding the data of that ID from every input file in turn.  A file that
    does not contain an ID contributes a single ``0.0``.  If an ID occurs
    several times in one file, its last occurrence is used.  Blank lines in
    the input are ignored.

    :param aaastrIn: Paths of the tab-delimited files to join.
    :type aaastrIn: collection of strings
    :param astrLabels: Column labels, one per input file (normally the file names).
    :type astrLabels: collection of strings
    :param iCol: Data column in which IDs are matched (zero-indexed).
    :type iCol: int
    :param ostm: Output stream to which the joined table is written.
    :type ostm: output stream

    """

    # The ID column header.
    strHeader = "ID"
    # Placeholder data written for an ID that is absent from an input file.
    astrMissing = [0.0]
    # The final set of all IDs in any table.
    setstrIDs = set()
    # One hash of IDs to data rows for each input file, in input order.
    ahashRows = []

    for strPath in aaastrIn:
        hashRows = {}
        with open( strPath ) as istm:
            for astrLine in csv.reader( istm, csv.excel_tab ):
                # csv.reader yields an empty list for a blank line; such a
                # row carries no ID, so skip it instead of failing on the index.
                if not astrLine:
                    continue
                # ID is taken from column iCol, data are everything else;
                # a repeated ID overwrites the earlier row.
                hashRows[astrLine[iCol]] = astrLine[:iCol] + astrLine[( iCol + 1 ):]
        setstrIDs.update( hashRows )
        ahashRows.append( hashRows )

    csvw = csv.writer( ostm, csv.excel_tab, lineterminator = "\n" )

    # Make the file names (minus extension) the column names.
    csvw.writerow( [strHeader] + [os.path.splitext( f )[0] for f in astrLabels] )

    # Write one joined row per ID, in sorted ID order.
    for strID in sorted( setstrIDs ):
        astrOut = [strID]
        for hashRows in ahashRows:
            astrOut.extend( hashRows.get( strID, astrMissing ) )
        csvw.writerow( astrOut )
|
|
85
|
|
86
|
|
# Command-line interface: one or more positional paths of tables to join.
argp = argparse.ArgumentParser(
    prog = "merge_metaphlan_tables.py",
    description = "Performs a table join on one or more metaphlan output files." )
argp.add_argument(
    "aistms",
    metavar = "input.txt",
    nargs = "+",
    help = "One or more tab-delimited text tables to join" )

# The module docstring is the parser's default help text, tab-indented under
# a "::" marker.  It must be captured before the usage text is overridden.
__doc__ = "::\n\n\t" + "\n\t".join( argp.format_help( ).split( "\n" ) )

# Replace the usage text with the generated one (minus its leading
# "usage: " prefix, which argparse adds again itself) followed by examples.
argp.usage = argp.format_usage( )[len( "usage: " ):] + (
    "\n\n\tPlease make sure to supply file paths to the files to combine. "
    "If combining 3 files (Table1.txt, Table2.txt, and Table3.txt) the call should be:"
    "\n\n\t\tpython merge_metaphlan_tables.py Table1.txt Table2.txt Table3.txt > output.txt"
    "\n\n\tA wildcard to indicate all .txt files that start with Table can be used as follows:"
    "\n\n\t\tpython merge_metaphlan_tables.py Table*.txt > output.txt" )
|
|
95
|
|
96
|
|
def _main( ):
    """Parse the command line and write the joined table to standard output."""
    astrPaths = argp.parse_args( ).aistms
    # Label each column with the file name only, dropping any directory part.
    astrLabels = [os.path.basename( strPath ) for strPath in astrPaths]
    merge( astrPaths, astrLabels, 0, sys.stdout )


# Run the merge only when executed as a script, not when imported.
if __name__ == "__main__":
    _main( )
|