graphlan_import: merge_metaphlan

annotate merge_metaphlan_tables.py @ 0:cac6247cb1d3 draft

graphlan_import

author	george-weingart
date	Tue, 26 Aug 2014 14:51:29 -0400
parents
children

rev	line source
0 cac6247cb1d3 graphlan_import george-weingart parents: diff changeset	1 #!/usr/bin/env python
cac6247cb1d3 graphlan_import george-weingart parents: diff changeset	2
cac6247cb1d3 graphlan_import george-weingart parents: diff changeset	3 # ==============================================================================
cac6247cb1d3 graphlan_import george-weingart parents: diff changeset	4 # Merge script: from MetaPhlAn output on single sample to a joined "clades vs samples" table
cac6247cb1d3 graphlan_import george-weingart parents: diff changeset	5 # Authors: Timothy Tickle (ttickle@hsph.harvard.edu) and Curtis Huttenhower (chuttenh@hsph.harvard.edu)
cac6247cb1d3 graphlan_import george-weingart parents: diff changeset	6 # ==============================================================================
cac6247cb1d3 graphlan_import george-weingart parents: diff changeset	7
cac6247cb1d3 graphlan_import george-weingart parents: diff changeset	8 import argparse
cac6247cb1d3 graphlan_import george-weingart parents: diff changeset	9 import csv
cac6247cb1d3 graphlan_import george-weingart parents: diff changeset	10 import os
cac6247cb1d3 graphlan_import george-weingart parents: diff changeset	11 import sys
cac6247cb1d3 graphlan_import george-weingart parents: diff changeset	12
cac6247cb1d3 graphlan_import george-weingart parents: diff changeset	13
def merge( aaastrIn, astrLabels, iCol, ostm ):
    """
    Outputs the table join of the given tab-delimited input files.

    Each input file is read with the ``csv.excel_tab`` dialect.  The value in
    column ``iCol`` of a row is that row's ID and the remaining values are its
    data.  A header row is written first, followed by one row per distinct ID
    (in sorted order) holding the data found for that ID in every input file,
    in input order.  A file that does not contain an ID contributes a single
    ``0.0`` for it.  If an ID occurs more than once in one file, the last
    occurrence is used.  Rows too short to contain the ID column (for example
    blank lines) are skipped.

    :param aaastrIn: Paths of the input files to join.
    :type aaastrIn: collection of strings
    :param astrLabels: File names of input data; each is written, with its
        extension removed, as a column header.
    :type astrLabels: collection of strings
    :param iCol: Data column in which IDs are matched (zero-indexed).
    :type iCol: int
    :param ostm: Output stream to which matched rows are written.
    :type ostm: output stream

    """

    # The final set of all IDs in any table.
    setstrIDs = set()
    # One hash of IDs to row numbers for each input file.
    ahashIDs = []
    # One data table (list of rows without the ID column) for each input file.
    aaastrData = []

    for strPath in aaastrIn:
        hashIDs = {}
        aastrData = []
        with open( strPath ) as csvfile:
            for astrLine in csv.reader( csvfile, csv.excel_tab ):
                # A blank line parses as an empty row; indexing it would raise
                # IndexError, so rows without an ID column are ignored.
                if len( astrLine ) <= iCol:
                    continue
                # Row number is the position the data row is about to take,
                # so the index stays valid even when rows are skipped.
                hashIDs[astrLine[iCol]] = len( aastrData )
                aastrData.append( astrLine[:iCol] + astrLine[( iCol + 1 ):] )
        setstrIDs.update( hashIDs )
        ahashIDs.append( hashIDs )
        aaastrData.append( aastrData )

    csvw = csv.writer( ostm, csv.excel_tab, lineterminator='\n' )

    # Make the file names (minus extension) the column names
    csvw.writerow( ["ID"] + [os.path.splitext( f )[0] for f in astrLabels] )

    # Write out data, one row per ID
    for strID in sorted( setstrIDs ):
        astrOut = []
        for hashIDs, aastrData in zip( ahashIDs, aaastrData ):
            # Look up the row number of the current ID in the current file, if any
            iID = hashIDs.get( strID )
            # Missing IDs are reported as 0.0; extend() copies the values, so
            # the stored rows are never modified.
            astrOut.extend( [0.0] if iID is None else aastrData[iID] )
        csvw.writerow( [strID] + astrOut )
cac6247cb1d3 graphlan_import george-weingart parents: diff changeset	85
cac6247cb1d3 graphlan_import george-weingart parents: diff changeset	86
# Command-line interface: one or more input tables given as positional arguments.
argp = argparse.ArgumentParser(
    prog = "merge_metaphlan_tables.py",
    description = """Performs a table join on one or more metaphlan output files.""" )
argp.add_argument(
    "aistms",
    metavar = "input.txt",
    nargs = "+",
    help = "One or more tab-delimited text tables to join" )

# Module docstring: the generated help text, tab-indented beneath a "::" line.
# It is built before the usage message is customised below, so it contains the
# automatically generated usage line.
__doc__ = "::\n\n\t" + argp.format_help( ).replace( "\n", "\n\t" )

# Custom usage message: the generated usage line with its first seven
# characters (the "usage: " prefix) dropped, followed by invocation examples.
argp.usage = argp.format_usage( )[7:] + (
    "\n\n\tPlease make sure to supply file paths to the files to combine."
    " If combining 3 files (Table1.txt, Table2.txt, and Table3.txt) the call should be:"
    "\n\n\t\tpython merge_metaphlan_tables.py Table1.txt Table2.txt Table3.txt > output.txt"
    "\n\n\tA wildcard to indicate all .txt files that start with Table can be used as follows:"
    "\n\n\t\tpython merge_metaphlan_tables.py Table*.txt > output.txt" )
cac6247cb1d3 graphlan_import george-weingart parents: diff changeset	95
cac6247cb1d3 graphlan_import george-weingart parents: diff changeset	96
def _main( ):
    """Parse the command line and write the joined table to standard output."""
    astrPaths = argp.parse_args( ).aistms
    # Column labels are the input file names with any directory part removed.
    astrLabels = [os.path.basename( strPath ) for strPath in astrPaths]
    # IDs are matched on the first column (index 0).
    merge( astrPaths, astrLabels, 0, sys.stdout )
cac6247cb1d3 graphlan_import george-weingart parents: diff changeset	100
cac6247cb1d3 graphlan_import george-weingart parents: diff changeset	101
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    _main( )

Mercurial > repos > george-weingart > graphlan_import

annotate merge_metaphlan_tables.py @ 0:cac6247cb1d3 draft