Mercurial > repos > cschu > kraken_tools
comparison kraken_combine.py @ 14:fd27c97c8366 draft default tip
Uploaded
| author | cschu |
|---|---|
| date | Mon, 18 May 2015 15:55:47 -0400 |
| parents | 0916697409ea |
| children |
comparison
equal
deleted
inserted
replaced
| 13:dc02dbf5e1a9 | 14:fd27c97c8366 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 import sys | |
| 3 import argparse | |
| 4 | |
| 5 def anabl_getContigsFromFASTQ(fn): | |
| 6 with open(fn) as fi: | |
| 7 for head in fi: | |
| 8 try: | |
| 9 seq, sep, qual = map(lambda x:x.strip(), [fi.next() for i in xrange(3)]) | |
| 10 yield head[1:], seq, qual | |
| 11 except: | |
| 12 break | |
| 13 pass | |
| 14 | |
| 15 def anabl_getContigsFromFASTA(fn): | |
| 16 with open(fn) as fi: | |
| 17 for head in fi: | |
| 18 try: | |
| 19 yield head[1:], fi.next().strip() | |
| 20 except: | |
| 21 break | |
| 22 pass | |
| 23 | |
| 24 def toFASTQ(fields): | |
| 25 return '@%s\n%s\n+\n%s\n' % fields | |
| 26 def toFASTA(fields): | |
| 27 return '>%s\n%s\n' % fields | |
| 28 def getFASTQID(head): | |
| 29 return head.strip().split()[0].strip('@').strip('/1') | |
| 30 def getFASTAID(head): | |
| 31 return head.strip().split()[0].strip('>').strip('/1') | |
| 32 | |
| 33 | |
| 34 | |
| 35 def main(argv): | |
| 36 DEFAULT_KRAKEN_DB= '' | |
| 37 parser = argparse.ArgumentParser(description='') | |
| 38 parser.add_argument('--db', default=DEFAULT_KRAKEN_DB, help='Path to kraken db.') # we don't need taxonomy here | |
| 39 parser.add_argument('--in1', default='', help='Forward read file from metagenomic sample.') | |
| 40 parser.add_argument('--in2', default='', help='Reverse read file from metagenomic sample.') | |
| 41 parser.add_argument('--kraken-results1', help='File containing kraken classification for set1.') | |
| 42 parser.add_argument('--kraken-results2', help='File containing kraken classification for set2.') | |
| 43 parser.add_argument('--input-format', default='fa', help='fa (FASTA) or fq (FASTQ)') | |
| 44 parser.add_argument('--set1-output-left', type=str) | |
| 45 parser.add_argument('--set1-output-right', type=str) | |
| 46 parser.add_argument('--set2-output-left', type=str) | |
| 47 parser.add_argument('--set2-output-right', type=str) | |
| 48 parser.add_argument('--unclassified-output-left', type=str) | |
| 49 parser.add_argument('--unclassified-output-right', type=str) | |
| 50 parser.add_argument('--intersection-output-left', type=str) | |
| 51 parser.add_argument('--intersection-output-right', type=str) | |
| 52 args = parser.parse_args() | |
| 53 | |
| 54 if args.input_format == 'fa': | |
| 55 getContigs = anabl_getContigsFromFASTA | |
| 56 writeFXRecord = toFASTA | |
| 57 getFXID = getFASTAID | |
| 58 elif args.input_format == 'fq': | |
| 59 getContigs = anabl_getContigsFromFASTQ | |
| 60 writeFXRecord = toFASTQ | |
| 61 getFXID = getFASTQID | |
| 62 else: | |
| 63 sys.stderr.write('Error: Unknown input format (%s).\n' % args.input_format) | |
| 64 sys.exit() | |
| 65 | |
| 66 try: | |
| 67 set1 = [line.strip().split()[:2] for line in open(args.kraken_results1)] | |
| 68 except: | |
| 69 sys.stderr.write('Error: Set1 is missing, please specify --kraken_results1 parameter.\n') | |
| 70 sys.exit() | |
| 71 | |
| 72 nReads = len(set1) | |
| 73 set1 = set(map(lambda x:x[1], filter(lambda x:x[0].startswith('C'), set1))) | |
| 74 | |
| 75 try: | |
| 76 set2 = set([line.strip().split()[1] for line in open(args.kraken_results2) if line.startswith('C')]) | |
| 77 except: | |
| 78 sys.stderr.write('Error: Set2 is missing, please specify --kraken_results2 parameter.\n') | |
| 79 sys.exit() | |
| 80 | |
| 81 try: | |
| 82 set1_fwd = open(args.set1_output_left, 'wb') | |
| 83 set2_fwd = open(args.set2_output_left, 'wb') | |
| 84 noclass_fwd = open(args.unclassified_output_left, 'wb') | |
| 85 undecided_fwd = open(args.intersection_output_left, 'wb') | |
| 86 except: | |
| 87 sys.stderr.write('Error: Cannot open fwd outputfile(s).\n') | |
| 88 sys.exit() | |
| 89 | |
| 90 try: | |
| 91 fwd = getContigs(args.in1) | |
| 92 except: | |
| 93 sys.stderr.write('Error: Cannot open file %s.\n' % args.in1) | |
| 94 sys.exit() | |
| 95 | |
| 96 rev = None | |
| 97 if args.in2: | |
| 98 try: | |
| 99 rev = getContigs(args.in2) | |
| 100 except: | |
| 101 sys.stderr.write('Error: Cannot open file %s.\n' % args.in2) | |
| 102 sys.exit() | |
| 103 try: | |
| 104 set1_rev = open(args.set1_output_right, 'wb') | |
| 105 set2_rev = open(args.set2_output_right, 'wb') | |
| 106 noclass_rev = open(args.unclassified_output_right, 'wb') | |
| 107 undecided_rev = open(args.intersection_output_right, 'wb') | |
| 108 except: | |
| 109 sys.stderr.write('Error: Cannot open rev outputfile(s).\n') | |
| 110 sys.exit() | |
| 111 | |
| 112 | |
| 113 for i in xrange(nReads): | |
| 114 try: | |
| 115 r1 = fwd.next() | |
| 116 except: | |
| 117 break | |
| 118 r2 = None | |
| 119 if rev is not None: | |
| 120 try: | |
| 121 r2 = rev.next() | |
| 122 except: | |
| 123 break | |
| 124 | |
| 125 id_ = getFXID(r1[0]) | |
| 126 | |
| 127 if id_ in set1: | |
| 128 if id_ not in set2: | |
| 129 set1_fwd.write(writeFXRecord(r1)) | |
| 130 if set1_rev: | |
| 131 set1_rev.write(writeFXRecord(r2)) | |
| 132 else: | |
| 133 undecided_fwd.write(writeFXRecord(r1)) | |
| 134 if undecided_rev: | |
| 135 undecided_rev.write(writeFXRecord(r2)) | |
| 136 | |
| 137 elif id_ in set2: | |
| 138 set2_fwd.write(writeFXRecord(r1)) | |
| 139 if set2_rev: | |
| 140 set2_rev.write(writeFXRecord(r2)) | |
| 141 else: | |
| 142 noclass_fwd.write(writeFXRecord(r1)) | |
| 143 if noclass_rev: | |
| 144 noclass_rev.write(writeFXRecord(r2)) | |
| 145 pass | |
| 146 | |
| 147 set1_fwd.close() | |
| 148 set1_rev.close() | |
| 149 set2_fwd.close() | |
| 150 set2_rev.close() | |
| 151 noclass_fwd.close() | |
| 152 noclass_rev.close() | |
| 153 undecided_fwd.close() | |
| 154 undecided_rev.close() | |
| 155 pass | |
| 156 | |
| 157 if __name__ == '__main__': main(sys.argv) |
