comparison kraken_combine.py @ 14:fd27c97c8366 draft default tip

Uploaded
author cschu
date Mon, 18 May 2015 15:55:47 -0400
parents 0916697409ea
children
comparison
equal deleted inserted replaced
13:dc02dbf5e1a9 14:fd27c97c8366
1 #!/usr/bin/env python
2 import sys
3 import argparse
4
5 def anabl_getContigsFromFASTQ(fn):
6 with open(fn) as fi:
7 for head in fi:
8 try:
9 seq, sep, qual = map(lambda x:x.strip(), [fi.next() for i in xrange(3)])
10 yield head[1:], seq, qual
11 except:
12 break
13 pass
14
15 def anabl_getContigsFromFASTA(fn):
16 with open(fn) as fi:
17 for head in fi:
18 try:
19 yield head[1:], fi.next().strip()
20 except:
21 break
22 pass
23
24 def toFASTQ(fields):
25 return '@%s\n%s\n+\n%s\n' % fields
26 def toFASTA(fields):
27 return '>%s\n%s\n' % fields
28 def getFASTQID(head):
29 return head.strip().split()[0].strip('@').strip('/1')
30 def getFASTAID(head):
31 return head.strip().split()[0].strip('>').strip('/1')
32
33
34
35 def main(argv):
36 DEFAULT_KRAKEN_DB= ''
37 parser = argparse.ArgumentParser(description='')
38 parser.add_argument('--db', default=DEFAULT_KRAKEN_DB, help='Path to kraken db.') # we don't need taxonomy here
39 parser.add_argument('--in1', default='', help='Forward read file from metagenomic sample.')
40 parser.add_argument('--in2', default='', help='Reverse read file from metagenomic sample.')
41 parser.add_argument('--kraken-results1', help='File containing kraken classification for set1.')
42 parser.add_argument('--kraken-results2', help='File containing kraken classification for set2.')
43 parser.add_argument('--input-format', default='fa', help='fa (FASTA) or fq (FASTQ)')
44 parser.add_argument('--set1-output-left', type=str)
45 parser.add_argument('--set1-output-right', type=str)
46 parser.add_argument('--set2-output-left', type=str)
47 parser.add_argument('--set2-output-right', type=str)
48 parser.add_argument('--unclassified-output-left', type=str)
49 parser.add_argument('--unclassified-output-right', type=str)
50 parser.add_argument('--intersection-output-left', type=str)
51 parser.add_argument('--intersection-output-right', type=str)
52 args = parser.parse_args()
53
54 if args.input_format == 'fa':
55 getContigs = anabl_getContigsFromFASTA
56 writeFXRecord = toFASTA
57 getFXID = getFASTAID
58 elif args.input_format == 'fq':
59 getContigs = anabl_getContigsFromFASTQ
60 writeFXRecord = toFASTQ
61 getFXID = getFASTQID
62 else:
63 sys.stderr.write('Error: Unknown input format (%s).\n' % args.input_format)
64 sys.exit()
65
66 try:
67 set1 = [line.strip().split()[:2] for line in open(args.kraken_results1)]
68 except:
69 sys.stderr.write('Error: Set1 is missing, please specify --kraken_results1 parameter.\n')
70 sys.exit()
71
72 nReads = len(set1)
73 set1 = set(map(lambda x:x[1], filter(lambda x:x[0].startswith('C'), set1)))
74
75 try:
76 set2 = set([line.strip().split()[1] for line in open(args.kraken_results2) if line.startswith('C')])
77 except:
78 sys.stderr.write('Error: Set2 is missing, please specify --kraken_results2 parameter.\n')
79 sys.exit()
80
81 try:
82 set1_fwd = open(args.set1_output_left, 'wb')
83 set2_fwd = open(args.set2_output_left, 'wb')
84 noclass_fwd = open(args.unclassified_output_left, 'wb')
85 undecided_fwd = open(args.intersection_output_left, 'wb')
86 except:
87 sys.stderr.write('Error: Cannot open fwd outputfile(s).\n')
88 sys.exit()
89
90 try:
91 fwd = getContigs(args.in1)
92 except:
93 sys.stderr.write('Error: Cannot open file %s.\n' % args.in1)
94 sys.exit()
95
96 rev = None
97 if args.in2:
98 try:
99 rev = getContigs(args.in2)
100 except:
101 sys.stderr.write('Error: Cannot open file %s.\n' % args.in2)
102 sys.exit()
103 try:
104 set1_rev = open(args.set1_output_right, 'wb')
105 set2_rev = open(args.set2_output_right, 'wb')
106 noclass_rev = open(args.unclassified_output_right, 'wb')
107 undecided_rev = open(args.intersection_output_right, 'wb')
108 except:
109 sys.stderr.write('Error: Cannot open rev outputfile(s).\n')
110 sys.exit()
111
112
113 for i in xrange(nReads):
114 try:
115 r1 = fwd.next()
116 except:
117 break
118 r2 = None
119 if rev is not None:
120 try:
121 r2 = rev.next()
122 except:
123 break
124
125 id_ = getFXID(r1[0])
126
127 if id_ in set1:
128 if id_ not in set2:
129 set1_fwd.write(writeFXRecord(r1))
130 if set1_rev:
131 set1_rev.write(writeFXRecord(r2))
132 else:
133 undecided_fwd.write(writeFXRecord(r1))
134 if undecided_rev:
135 undecided_rev.write(writeFXRecord(r2))
136
137 elif id_ in set2:
138 set2_fwd.write(writeFXRecord(r1))
139 if set2_rev:
140 set2_rev.write(writeFXRecord(r2))
141 else:
142 noclass_fwd.write(writeFXRecord(r1))
143 if noclass_rev:
144 noclass_rev.write(writeFXRecord(r2))
145 pass
146
147 set1_fwd.close()
148 set1_rev.close()
149 set2_fwd.close()
150 set2_rev.close()
151 noclass_fwd.close()
152 noclass_rev.close()
153 undecided_fwd.close()
154 undecided_rev.close()
155 pass
156
157 if __name__ == '__main__': main(sys.argv)