annotate kraken_combine.py @ 0:0916697409ea draft

Uploaded
author cschu
date Mon, 18 May 2015 15:29:05 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
0916697409ea Uploaded
cschu
parents:
diff changeset
1 #!/usr/bin/env python
0916697409ea Uploaded
cschu
parents:
diff changeset
2 import sys
0916697409ea Uploaded
cschu
parents:
diff changeset
3 import argparse
0916697409ea Uploaded
cschu
parents:
diff changeset
4
0916697409ea Uploaded
cschu
parents:
diff changeset
5 def anabl_getContigsFromFASTQ(fn):
0916697409ea Uploaded
cschu
parents:
diff changeset
6 with open(fn) as fi:
0916697409ea Uploaded
cschu
parents:
diff changeset
7 for head in fi:
0916697409ea Uploaded
cschu
parents:
diff changeset
8 try:
0916697409ea Uploaded
cschu
parents:
diff changeset
9 seq, sep, qual = map(lambda x:x.strip(), [fi.next() for i in xrange(3)])
0916697409ea Uploaded
cschu
parents:
diff changeset
10 yield head[1:], seq, qual
0916697409ea Uploaded
cschu
parents:
diff changeset
11 except:
0916697409ea Uploaded
cschu
parents:
diff changeset
12 break
0916697409ea Uploaded
cschu
parents:
diff changeset
13 pass
0916697409ea Uploaded
cschu
parents:
diff changeset
14
0916697409ea Uploaded
cschu
parents:
diff changeset
15 def anabl_getContigsFromFASTA(fn):
0916697409ea Uploaded
cschu
parents:
diff changeset
16 with open(fn) as fi:
0916697409ea Uploaded
cschu
parents:
diff changeset
17 for head in fi:
0916697409ea Uploaded
cschu
parents:
diff changeset
18 try:
0916697409ea Uploaded
cschu
parents:
diff changeset
19 yield head[1:], fi.next().strip()
0916697409ea Uploaded
cschu
parents:
diff changeset
20 except:
0916697409ea Uploaded
cschu
parents:
diff changeset
21 break
0916697409ea Uploaded
cschu
parents:
diff changeset
22 pass
0916697409ea Uploaded
cschu
parents:
diff changeset
23
0916697409ea Uploaded
cschu
parents:
diff changeset
24 def toFASTQ(fields):
0916697409ea Uploaded
cschu
parents:
diff changeset
25 return '@%s\n%s\n+\n%s\n' % fields
0916697409ea Uploaded
cschu
parents:
diff changeset
26 def toFASTA(fields):
0916697409ea Uploaded
cschu
parents:
diff changeset
27 return '>%s\n%s\n' % fields
0916697409ea Uploaded
cschu
parents:
diff changeset
28 def getFASTQID(head):
0916697409ea Uploaded
cschu
parents:
diff changeset
29 return head.strip().split()[0].strip('@').strip('/1')
0916697409ea Uploaded
cschu
parents:
diff changeset
30 def getFASTAID(head):
0916697409ea Uploaded
cschu
parents:
diff changeset
31 return head.strip().split()[0].strip('>').strip('/1')
0916697409ea Uploaded
cschu
parents:
diff changeset
32
0916697409ea Uploaded
cschu
parents:
diff changeset
33
0916697409ea Uploaded
cschu
parents:
diff changeset
34
0916697409ea Uploaded
cschu
parents:
diff changeset
35 def main(argv):
0916697409ea Uploaded
cschu
parents:
diff changeset
36 DEFAULT_KRAKEN_DB= ''
0916697409ea Uploaded
cschu
parents:
diff changeset
37 parser = argparse.ArgumentParser(description='')
0916697409ea Uploaded
cschu
parents:
diff changeset
38 parser.add_argument('--db', default=DEFAULT_KRAKEN_DB, help='Path to kraken db.') # we don't need taxonomy here
0916697409ea Uploaded
cschu
parents:
diff changeset
39 parser.add_argument('--in1', default='', help='Forward read file from metagenomic sample.')
0916697409ea Uploaded
cschu
parents:
diff changeset
40 parser.add_argument('--in2', default='', help='Reverse read file from metagenomic sample.')
0916697409ea Uploaded
cschu
parents:
diff changeset
41 parser.add_argument('--kraken-results1', help='File containing kraken classification for set1.')
0916697409ea Uploaded
cschu
parents:
diff changeset
42 parser.add_argument('--kraken-results2', help='File containing kraken classification for set2.')
0916697409ea Uploaded
cschu
parents:
diff changeset
43 parser.add_argument('--input-format', default='fa', help='fa (FASTA) or fq (FASTQ)')
0916697409ea Uploaded
cschu
parents:
diff changeset
44 parser.add_argument('--set1-output-left', type=str)
0916697409ea Uploaded
cschu
parents:
diff changeset
45 parser.add_argument('--set1-output-right', type=str)
0916697409ea Uploaded
cschu
parents:
diff changeset
46 parser.add_argument('--set2-output-left', type=str)
0916697409ea Uploaded
cschu
parents:
diff changeset
47 parser.add_argument('--set2-output-right', type=str)
0916697409ea Uploaded
cschu
parents:
diff changeset
48 parser.add_argument('--unclassified-output-left', type=str)
0916697409ea Uploaded
cschu
parents:
diff changeset
49 parser.add_argument('--unclassified-output-right', type=str)
0916697409ea Uploaded
cschu
parents:
diff changeset
50 parser.add_argument('--intersection-output-left', type=str)
0916697409ea Uploaded
cschu
parents:
diff changeset
51 parser.add_argument('--intersection-output-right', type=str)
0916697409ea Uploaded
cschu
parents:
diff changeset
52 args = parser.parse_args()
0916697409ea Uploaded
cschu
parents:
diff changeset
53
0916697409ea Uploaded
cschu
parents:
diff changeset
54 if args.input_format == 'fa':
0916697409ea Uploaded
cschu
parents:
diff changeset
55 getContigs = anabl_getContigsFromFASTA
0916697409ea Uploaded
cschu
parents:
diff changeset
56 writeFXRecord = toFASTA
0916697409ea Uploaded
cschu
parents:
diff changeset
57 getFXID = getFASTAID
0916697409ea Uploaded
cschu
parents:
diff changeset
58 elif args.input_format == 'fq':
0916697409ea Uploaded
cschu
parents:
diff changeset
59 getContigs = anabl_getContigsFromFASTQ
0916697409ea Uploaded
cschu
parents:
diff changeset
60 writeFXRecord = toFASTQ
0916697409ea Uploaded
cschu
parents:
diff changeset
61 getFXID = getFASTQID
0916697409ea Uploaded
cschu
parents:
diff changeset
62 else:
0916697409ea Uploaded
cschu
parents:
diff changeset
63 sys.stderr.write('Error: Unknown input format (%s).\n' % args.input_format)
0916697409ea Uploaded
cschu
parents:
diff changeset
64 sys.exit()
0916697409ea Uploaded
cschu
parents:
diff changeset
65
0916697409ea Uploaded
cschu
parents:
diff changeset
66 try:
0916697409ea Uploaded
cschu
parents:
diff changeset
67 set1 = [line.strip().split()[:2] for line in open(args.kraken_results1)]
0916697409ea Uploaded
cschu
parents:
diff changeset
68 except:
0916697409ea Uploaded
cschu
parents:
diff changeset
69 sys.stderr.write('Error: Set1 is missing, please specify --kraken_results1 parameter.\n')
0916697409ea Uploaded
cschu
parents:
diff changeset
70 sys.exit()
0916697409ea Uploaded
cschu
parents:
diff changeset
71
0916697409ea Uploaded
cschu
parents:
diff changeset
72 nReads = len(set1)
0916697409ea Uploaded
cschu
parents:
diff changeset
73 set1 = set(map(lambda x:x[1], filter(lambda x:x[0].startswith('C'), set1)))
0916697409ea Uploaded
cschu
parents:
diff changeset
74
0916697409ea Uploaded
cschu
parents:
diff changeset
75 try:
0916697409ea Uploaded
cschu
parents:
diff changeset
76 set2 = set([line.strip().split()[1] for line in open(args.kraken_results2) if line.startswith('C')])
0916697409ea Uploaded
cschu
parents:
diff changeset
77 except:
0916697409ea Uploaded
cschu
parents:
diff changeset
78 sys.stderr.write('Error: Set2 is missing, please specify --kraken_results2 parameter.\n')
0916697409ea Uploaded
cschu
parents:
diff changeset
79 sys.exit()
0916697409ea Uploaded
cschu
parents:
diff changeset
80
0916697409ea Uploaded
cschu
parents:
diff changeset
81 try:
0916697409ea Uploaded
cschu
parents:
diff changeset
82 set1_fwd = open(args.set1_output_left, 'wb')
0916697409ea Uploaded
cschu
parents:
diff changeset
83 set2_fwd = open(args.set2_output_left, 'wb')
0916697409ea Uploaded
cschu
parents:
diff changeset
84 noclass_fwd = open(args.unclassified_output_left, 'wb')
0916697409ea Uploaded
cschu
parents:
diff changeset
85 undecided_fwd = open(args.intersection_output_left, 'wb')
0916697409ea Uploaded
cschu
parents:
diff changeset
86 except:
0916697409ea Uploaded
cschu
parents:
diff changeset
87 sys.stderr.write('Error: Cannot open fwd outputfile(s).\n')
0916697409ea Uploaded
cschu
parents:
diff changeset
88 sys.exit()
0916697409ea Uploaded
cschu
parents:
diff changeset
89
0916697409ea Uploaded
cschu
parents:
diff changeset
90 try:
0916697409ea Uploaded
cschu
parents:
diff changeset
91 fwd = getContigs(args.in1)
0916697409ea Uploaded
cschu
parents:
diff changeset
92 except:
0916697409ea Uploaded
cschu
parents:
diff changeset
93 sys.stderr.write('Error: Cannot open file %s.\n' % args.in1)
0916697409ea Uploaded
cschu
parents:
diff changeset
94 sys.exit()
0916697409ea Uploaded
cschu
parents:
diff changeset
95
0916697409ea Uploaded
cschu
parents:
diff changeset
96 rev = None
0916697409ea Uploaded
cschu
parents:
diff changeset
97 if args.in2:
0916697409ea Uploaded
cschu
parents:
diff changeset
98 try:
0916697409ea Uploaded
cschu
parents:
diff changeset
99 rev = getContigs(args.in2)
0916697409ea Uploaded
cschu
parents:
diff changeset
100 except:
0916697409ea Uploaded
cschu
parents:
diff changeset
101 sys.stderr.write('Error: Cannot open file %s.\n' % args.in2)
0916697409ea Uploaded
cschu
parents:
diff changeset
102 sys.exit()
0916697409ea Uploaded
cschu
parents:
diff changeset
103 try:
0916697409ea Uploaded
cschu
parents:
diff changeset
104 set1_rev = open(args.set1_output_right, 'wb')
0916697409ea Uploaded
cschu
parents:
diff changeset
105 set2_rev = open(args.set2_output_right, 'wb')
0916697409ea Uploaded
cschu
parents:
diff changeset
106 noclass_rev = open(args.unclassified_output_right, 'wb')
0916697409ea Uploaded
cschu
parents:
diff changeset
107 undecided_rev = open(args.intersection_output_right, 'wb')
0916697409ea Uploaded
cschu
parents:
diff changeset
108 except:
0916697409ea Uploaded
cschu
parents:
diff changeset
109 sys.stderr.write('Error: Cannot open rev outputfile(s).\n')
0916697409ea Uploaded
cschu
parents:
diff changeset
110 sys.exit()
0916697409ea Uploaded
cschu
parents:
diff changeset
111
0916697409ea Uploaded
cschu
parents:
diff changeset
112
0916697409ea Uploaded
cschu
parents:
diff changeset
113 for i in xrange(nReads):
0916697409ea Uploaded
cschu
parents:
diff changeset
114 try:
0916697409ea Uploaded
cschu
parents:
diff changeset
115 r1 = fwd.next()
0916697409ea Uploaded
cschu
parents:
diff changeset
116 except:
0916697409ea Uploaded
cschu
parents:
diff changeset
117 break
0916697409ea Uploaded
cschu
parents:
diff changeset
118 r2 = None
0916697409ea Uploaded
cschu
parents:
diff changeset
119 if rev is not None:
0916697409ea Uploaded
cschu
parents:
diff changeset
120 try:
0916697409ea Uploaded
cschu
parents:
diff changeset
121 r2 = rev.next()
0916697409ea Uploaded
cschu
parents:
diff changeset
122 except:
0916697409ea Uploaded
cschu
parents:
diff changeset
123 break
0916697409ea Uploaded
cschu
parents:
diff changeset
124
0916697409ea Uploaded
cschu
parents:
diff changeset
125 id_ = getFXID(r1[0])
0916697409ea Uploaded
cschu
parents:
diff changeset
126
0916697409ea Uploaded
cschu
parents:
diff changeset
127 if id_ in set1:
0916697409ea Uploaded
cschu
parents:
diff changeset
128 if id_ not in set2:
0916697409ea Uploaded
cschu
parents:
diff changeset
129 set1_fwd.write(writeFXRecord(r1))
0916697409ea Uploaded
cschu
parents:
diff changeset
130 if set1_rev:
0916697409ea Uploaded
cschu
parents:
diff changeset
131 set1_rev.write(writeFXRecord(r2))
0916697409ea Uploaded
cschu
parents:
diff changeset
132 else:
0916697409ea Uploaded
cschu
parents:
diff changeset
133 undecided_fwd.write(writeFXRecord(r1))
0916697409ea Uploaded
cschu
parents:
diff changeset
134 if undecided_rev:
0916697409ea Uploaded
cschu
parents:
diff changeset
135 undecided_rev.write(writeFXRecord(r2))
0916697409ea Uploaded
cschu
parents:
diff changeset
136
0916697409ea Uploaded
cschu
parents:
diff changeset
137 elif id_ in set2:
0916697409ea Uploaded
cschu
parents:
diff changeset
138 set2_fwd.write(writeFXRecord(r1))
0916697409ea Uploaded
cschu
parents:
diff changeset
139 if set2_rev:
0916697409ea Uploaded
cschu
parents:
diff changeset
140 set2_rev.write(writeFXRecord(r2))
0916697409ea Uploaded
cschu
parents:
diff changeset
141 else:
0916697409ea Uploaded
cschu
parents:
diff changeset
142 noclass_fwd.write(writeFXRecord(r1))
0916697409ea Uploaded
cschu
parents:
diff changeset
143 if noclass_rev:
0916697409ea Uploaded
cschu
parents:
diff changeset
144 noclass_rev.write(writeFXRecord(r2))
0916697409ea Uploaded
cschu
parents:
diff changeset
145 pass
0916697409ea Uploaded
cschu
parents:
diff changeset
146
0916697409ea Uploaded
cschu
parents:
diff changeset
147 set1_fwd.close()
0916697409ea Uploaded
cschu
parents:
diff changeset
148 set1_rev.close()
0916697409ea Uploaded
cschu
parents:
diff changeset
149 set2_fwd.close()
0916697409ea Uploaded
cschu
parents:
diff changeset
150 set2_rev.close()
0916697409ea Uploaded
cschu
parents:
diff changeset
151 noclass_fwd.close()
0916697409ea Uploaded
cschu
parents:
diff changeset
152 noclass_rev.close()
0916697409ea Uploaded
cschu
parents:
diff changeset
153 undecided_fwd.close()
0916697409ea Uploaded
cschu
parents:
diff changeset
154 undecided_rev.close()
0916697409ea Uploaded
cschu
parents:
diff changeset
155 pass
0916697409ea Uploaded
cschu
parents:
diff changeset
156
0916697409ea Uploaded
cschu
parents:
diff changeset
157 if __name__ == '__main__': main(sys.argv)