Mercurial > repos > cschu > kraken_tools
comparison kraken_combine.py @ 8:fe6f7ae0bec2 draft
Uploaded
author | cschu |
---|---|
date | Mon, 18 May 2015 15:45:09 -0400 |
parents | 0916697409ea |
children |
comparison
equal
deleted
inserted
replaced
7:5f6f86c9937b | 8:fe6f7ae0bec2 |
---|---|
1 #!/usr/bin/env python | |
2 import sys | |
3 import argparse | |
4 | |
5 def anabl_getContigsFromFASTQ(fn): | |
6 with open(fn) as fi: | |
7 for head in fi: | |
8 try: | |
9 seq, sep, qual = map(lambda x:x.strip(), [fi.next() for i in xrange(3)]) | |
10 yield head[1:], seq, qual | |
11 except: | |
12 break | |
13 pass | |
14 | |
15 def anabl_getContigsFromFASTA(fn): | |
16 with open(fn) as fi: | |
17 for head in fi: | |
18 try: | |
19 yield head[1:], fi.next().strip() | |
20 except: | |
21 break | |
22 pass | |
23 | |
24 def toFASTQ(fields): | |
25 return '@%s\n%s\n+\n%s\n' % fields | |
26 def toFASTA(fields): | |
27 return '>%s\n%s\n' % fields | |
28 def getFASTQID(head): | |
29 return head.strip().split()[0].strip('@').strip('/1') | |
30 def getFASTAID(head): | |
31 return head.strip().split()[0].strip('>').strip('/1') | |
32 | |
33 | |
34 | |
35 def main(argv): | |
36 DEFAULT_KRAKEN_DB= '' | |
37 parser = argparse.ArgumentParser(description='') | |
38 parser.add_argument('--db', default=DEFAULT_KRAKEN_DB, help='Path to kraken db.') # we don't need taxonomy here | |
39 parser.add_argument('--in1', default='', help='Forward read file from metagenomic sample.') | |
40 parser.add_argument('--in2', default='', help='Reverse read file from metagenomic sample.') | |
41 parser.add_argument('--kraken-results1', help='File containing kraken classification for set1.') | |
42 parser.add_argument('--kraken-results2', help='File containing kraken classification for set2.') | |
43 parser.add_argument('--input-format', default='fa', help='fa (FASTA) or fq (FASTQ)') | |
44 parser.add_argument('--set1-output-left', type=str) | |
45 parser.add_argument('--set1-output-right', type=str) | |
46 parser.add_argument('--set2-output-left', type=str) | |
47 parser.add_argument('--set2-output-right', type=str) | |
48 parser.add_argument('--unclassified-output-left', type=str) | |
49 parser.add_argument('--unclassified-output-right', type=str) | |
50 parser.add_argument('--intersection-output-left', type=str) | |
51 parser.add_argument('--intersection-output-right', type=str) | |
52 args = parser.parse_args() | |
53 | |
54 if args.input_format == 'fa': | |
55 getContigs = anabl_getContigsFromFASTA | |
56 writeFXRecord = toFASTA | |
57 getFXID = getFASTAID | |
58 elif args.input_format == 'fq': | |
59 getContigs = anabl_getContigsFromFASTQ | |
60 writeFXRecord = toFASTQ | |
61 getFXID = getFASTQID | |
62 else: | |
63 sys.stderr.write('Error: Unknown input format (%s).\n' % args.input_format) | |
64 sys.exit() | |
65 | |
66 try: | |
67 set1 = [line.strip().split()[:2] for line in open(args.kraken_results1)] | |
68 except: | |
69 sys.stderr.write('Error: Set1 is missing, please specify --kraken_results1 parameter.\n') | |
70 sys.exit() | |
71 | |
72 nReads = len(set1) | |
73 set1 = set(map(lambda x:x[1], filter(lambda x:x[0].startswith('C'), set1))) | |
74 | |
75 try: | |
76 set2 = set([line.strip().split()[1] for line in open(args.kraken_results2) if line.startswith('C')]) | |
77 except: | |
78 sys.stderr.write('Error: Set2 is missing, please specify --kraken_results2 parameter.\n') | |
79 sys.exit() | |
80 | |
81 try: | |
82 set1_fwd = open(args.set1_output_left, 'wb') | |
83 set2_fwd = open(args.set2_output_left, 'wb') | |
84 noclass_fwd = open(args.unclassified_output_left, 'wb') | |
85 undecided_fwd = open(args.intersection_output_left, 'wb') | |
86 except: | |
87 sys.stderr.write('Error: Cannot open fwd outputfile(s).\n') | |
88 sys.exit() | |
89 | |
90 try: | |
91 fwd = getContigs(args.in1) | |
92 except: | |
93 sys.stderr.write('Error: Cannot open file %s.\n' % args.in1) | |
94 sys.exit() | |
95 | |
96 rev = None | |
97 if args.in2: | |
98 try: | |
99 rev = getContigs(args.in2) | |
100 except: | |
101 sys.stderr.write('Error: Cannot open file %s.\n' % args.in2) | |
102 sys.exit() | |
103 try: | |
104 set1_rev = open(args.set1_output_right, 'wb') | |
105 set2_rev = open(args.set2_output_right, 'wb') | |
106 noclass_rev = open(args.unclassified_output_right, 'wb') | |
107 undecided_rev = open(args.intersection_output_right, 'wb') | |
108 except: | |
109 sys.stderr.write('Error: Cannot open rev outputfile(s).\n') | |
110 sys.exit() | |
111 | |
112 | |
113 for i in xrange(nReads): | |
114 try: | |
115 r1 = fwd.next() | |
116 except: | |
117 break | |
118 r2 = None | |
119 if rev is not None: | |
120 try: | |
121 r2 = rev.next() | |
122 except: | |
123 break | |
124 | |
125 id_ = getFXID(r1[0]) | |
126 | |
127 if id_ in set1: | |
128 if id_ not in set2: | |
129 set1_fwd.write(writeFXRecord(r1)) | |
130 if set1_rev: | |
131 set1_rev.write(writeFXRecord(r2)) | |
132 else: | |
133 undecided_fwd.write(writeFXRecord(r1)) | |
134 if undecided_rev: | |
135 undecided_rev.write(writeFXRecord(r2)) | |
136 | |
137 elif id_ in set2: | |
138 set2_fwd.write(writeFXRecord(r1)) | |
139 if set2_rev: | |
140 set2_rev.write(writeFXRecord(r2)) | |
141 else: | |
142 noclass_fwd.write(writeFXRecord(r1)) | |
143 if noclass_rev: | |
144 noclass_rev.write(writeFXRecord(r2)) | |
145 pass | |
146 | |
147 set1_fwd.close() | |
148 set1_rev.close() | |
149 set2_fwd.close() | |
150 set2_rev.close() | |
151 noclass_fwd.close() | |
152 noclass_rev.close() | |
153 undecided_fwd.close() | |
154 undecided_rev.close() | |
155 pass | |
156 | |
157 if __name__ == '__main__': main(sys.argv) |