diff kraken_combine.py @ 8:fe6f7ae0bec2 draft

Uploaded
author cschu
date Mon, 18 May 2015 15:45:09 -0400
parents 0916697409ea
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/kraken_combine.py	Mon May 18 15:45:09 2015 -0400
@@ -0,0 +1,157 @@
+#!/usr/bin/env python
+import sys
+import argparse
+
+def anabl_getContigsFromFASTQ(fn):
+    with open(fn) as fi:
+        for head in fi:
+            try:
+                seq, sep, qual = map(lambda x:x.strip(), [fi.next() for i in xrange(3)])
+                yield head[1:], seq, qual
+            except:
+                    break
+    pass
+
+def anabl_getContigsFromFASTA(fn):
+    with open(fn) as fi:
+            for head in fi:
+                try:                
+                    yield head[1:], fi.next().strip()
+                except:
+                    break
+    pass
+
+def toFASTQ(fields):
+    return '@%s\n%s\n+\n%s\n' % fields
+def toFASTA(fields):
+    return '>%s\n%s\n' % fields
+def getFASTQID(head):
+    return head.strip().split()[0].strip('@').strip('/1')
+def getFASTAID(head):
+    return head.strip().split()[0].strip('>').strip('/1')
+
+
+
+def main(argv):
+    DEFAULT_KRAKEN_DB= ''
+    parser = argparse.ArgumentParser(description='')
+    parser.add_argument('--db', default=DEFAULT_KRAKEN_DB, help='Path to kraken db.')  # we don't need taxonomy here
+    parser.add_argument('--in1', default='', help='Forward read file from metagenomic sample.')
+    parser.add_argument('--in2', default='', help='Reverse read file from metagenomic sample.')
+    parser.add_argument('--kraken-results1', help='File containing kraken classification for set1.')
+    parser.add_argument('--kraken-results2', help='File containing kraken classification for set2.')
+    parser.add_argument('--input-format', default='fa', help='fa (FASTA) or fq (FASTQ)')
+    parser.add_argument('--set1-output-left', type=str)
+    parser.add_argument('--set1-output-right', type=str)
+    parser.add_argument('--set2-output-left', type=str)
+    parser.add_argument('--set2-output-right', type=str)
+    parser.add_argument('--unclassified-output-left', type=str)
+    parser.add_argument('--unclassified-output-right', type=str)
+    parser.add_argument('--intersection-output-left', type=str)
+    parser.add_argument('--intersection-output-right', type=str)    
+    args = parser.parse_args()
+
+    if args.input_format == 'fa':
+        getContigs = anabl_getContigsFromFASTA  
+        writeFXRecord = toFASTA
+        getFXID = getFASTAID
+    elif args.input_format == 'fq':
+        getContigs = anabl_getContigsFromFASTQ
+        writeFXRecord = toFASTQ
+        getFXID = getFASTQID
+    else:
+        sys.stderr.write('Error: Unknown input format (%s).\n' % args.input_format)
+        sys.exit()
+
+    try:
+        set1 = [line.strip().split()[:2] for line in open(args.kraken_results1)]
+    except:
+        sys.stderr.write('Error: Set1 is missing, please specify --kraken_results1 parameter.\n')
+        sys.exit()
+
+    nReads = len(set1)
+    set1 = set(map(lambda x:x[1], filter(lambda x:x[0].startswith('C'), set1)))
+
+    try:
+        set2 = set([line.strip().split()[1] for line in open(args.kraken_results2) if line.startswith('C')])
+    except:
+        sys.stderr.write('Error: Set2 is missing, please specify --kraken_results2 parameter.\n')
+        sys.exit()
+
+    try:
+        set1_fwd = open(args.set1_output_left, 'wb')        
+        set2_fwd = open(args.set2_output_left, 'wb')
+        noclass_fwd = open(args.unclassified_output_left, 'wb')
+        undecided_fwd = open(args.intersection_output_left, 'wb')        
+    except:
+        sys.stderr.write('Error: Cannot open fwd outputfile(s).\n')
+        sys.exit()
+
+    try:
+        fwd = getContigs(args.in1) 
+    except:
+        sys.stderr.write('Error: Cannot open file %s.\n' % args.in1)
+        sys.exit()
+
+    rev = None
+    if args.in2:
+        try:
+            rev = getContigs(args.in2)
+        except:
+            sys.stderr.write('Error: Cannot open file %s.\n' % args.in2)
+            sys.exit()            
+        try:
+            set1_rev = open(args.set1_output_right, 'wb')
+            set2_rev = open(args.set2_output_right, 'wb')
+            noclass_rev = open(args.unclassified_output_right, 'wb')
+            undecided_rev = open(args.intersection_output_right, 'wb')
+        except:
+            sys.stderr.write('Error: Cannot open rev outputfile(s).\n')
+            sys.exit()            
+
+
+    for i in xrange(nReads):
+        try:
+            r1 = fwd.next()
+        except:
+            break
+        r2 = None
+        if rev is not None:
+            try:
+                r2 = rev.next()
+            except:
+                break
+    
+        id_ = getFXID(r1[0])
+
+        if id_ in set1:
+            if id_ not in set2:
+                set1_fwd.write(writeFXRecord(r1))
+                if set1_rev: 
+                    set1_rev.write(writeFXRecord(r2))
+            else:
+                undecided_fwd.write(writeFXRecord(r1))
+                if undecided_rev: 
+                    undecided_rev.write(writeFXRecord(r2))
+            
+        elif id_ in set2:
+            set2_fwd.write(writeFXRecord(r1))
+            if set2_rev: 
+                set2_rev.write(writeFXRecord(r2))
+        else:
+            noclass_fwd.write(writeFXRecord(r1))
+            if noclass_rev: 
+                noclass_rev.write(writeFXRecord(r2))
+        pass
+
+    set1_fwd.close()
+    set1_rev.close()
+    set2_fwd.close()
+    set2_rev.close()
+    noclass_fwd.close()
+    noclass_rev.close()
+    undecided_fwd.close()
+    undecided_rev.close()
+    pass
+
+if __name__ == '__main__': main(sys.argv)