0
|
1 #!/usr/bin/env python
|
|
2 import sys
|
|
3 import argparse
|
|
4
|
|
5 def anabl_getContigsFromFASTQ(fn):
|
|
6 with open(fn) as fi:
|
|
7 for head in fi:
|
|
8 try:
|
|
9 seq, sep, qual = map(lambda x:x.strip(), [fi.next() for i in xrange(3)])
|
|
10 yield head[1:], seq, qual
|
|
11 except:
|
|
12 break
|
|
13 pass
|
|
14
|
|
15 def anabl_getContigsFromFASTA(fn):
|
|
16 with open(fn) as fi:
|
|
17 for head in fi:
|
|
18 try:
|
|
19 yield head[1:], fi.next().strip()
|
|
20 except:
|
|
21 break
|
|
22 pass
|
|
23
|
|
24 def toFASTQ(fields):
|
|
25 return '@%s\n%s\n+\n%s\n' % fields
|
|
26 def toFASTA(fields):
|
|
27 return '>%s\n%s\n' % fields
|
|
28 def getFASTQID(head):
|
|
29 return head.strip().split()[0].strip('@').strip('/1')
|
|
30 def getFASTAID(head):
|
|
31 return head.strip().split()[0].strip('>').strip('/1')
|
|
32
|
|
33
|
|
34
|
|
35 def main(argv):
|
|
36 DEFAULT_KRAKEN_DB= ''
|
|
37 parser = argparse.ArgumentParser(description='')
|
|
38 parser.add_argument('--db', default=DEFAULT_KRAKEN_DB, help='Path to kraken db.') # we don't need taxonomy here
|
|
39 parser.add_argument('--in1', default='', help='Forward read file from metagenomic sample.')
|
|
40 parser.add_argument('--in2', default='', help='Reverse read file from metagenomic sample.')
|
|
41 parser.add_argument('--kraken-results1', help='File containing kraken classification for set1.')
|
|
42 parser.add_argument('--kraken-results2', help='File containing kraken classification for set2.')
|
|
43 parser.add_argument('--input-format', default='fa', help='fa (FASTA) or fq (FASTQ)')
|
|
44 parser.add_argument('--set1-output-left', type=str)
|
|
45 parser.add_argument('--set1-output-right', type=str)
|
|
46 parser.add_argument('--set2-output-left', type=str)
|
|
47 parser.add_argument('--set2-output-right', type=str)
|
|
48 parser.add_argument('--unclassified-output-left', type=str)
|
|
49 parser.add_argument('--unclassified-output-right', type=str)
|
|
50 parser.add_argument('--intersection-output-left', type=str)
|
|
51 parser.add_argument('--intersection-output-right', type=str)
|
|
52 args = parser.parse_args()
|
|
53
|
|
54 if args.input_format == 'fa':
|
|
55 getContigs = anabl_getContigsFromFASTA
|
|
56 writeFXRecord = toFASTA
|
|
57 getFXID = getFASTAID
|
|
58 elif args.input_format == 'fq':
|
|
59 getContigs = anabl_getContigsFromFASTQ
|
|
60 writeFXRecord = toFASTQ
|
|
61 getFXID = getFASTQID
|
|
62 else:
|
|
63 sys.stderr.write('Error: Unknown input format (%s).\n' % args.input_format)
|
|
64 sys.exit()
|
|
65
|
|
66 try:
|
|
67 set1 = [line.strip().split()[:2] for line in open(args.kraken_results1)]
|
|
68 except:
|
|
69 sys.stderr.write('Error: Set1 is missing, please specify --kraken_results1 parameter.\n')
|
|
70 sys.exit()
|
|
71
|
|
72 nReads = len(set1)
|
|
73 set1 = set(map(lambda x:x[1], filter(lambda x:x[0].startswith('C'), set1)))
|
|
74
|
|
75 try:
|
|
76 set2 = set([line.strip().split()[1] for line in open(args.kraken_results2) if line.startswith('C')])
|
|
77 except:
|
|
78 sys.stderr.write('Error: Set2 is missing, please specify --kraken_results2 parameter.\n')
|
|
79 sys.exit()
|
|
80
|
|
81 try:
|
|
82 set1_fwd = open(args.set1_output_left, 'wb')
|
|
83 set2_fwd = open(args.set2_output_left, 'wb')
|
|
84 noclass_fwd = open(args.unclassified_output_left, 'wb')
|
|
85 undecided_fwd = open(args.intersection_output_left, 'wb')
|
|
86 except:
|
|
87 sys.stderr.write('Error: Cannot open fwd outputfile(s).\n')
|
|
88 sys.exit()
|
|
89
|
|
90 try:
|
|
91 fwd = getContigs(args.in1)
|
|
92 except:
|
|
93 sys.stderr.write('Error: Cannot open file %s.\n' % args.in1)
|
|
94 sys.exit()
|
|
95
|
|
96 rev = None
|
|
97 if args.in2:
|
|
98 try:
|
|
99 rev = getContigs(args.in2)
|
|
100 except:
|
|
101 sys.stderr.write('Error: Cannot open file %s.\n' % args.in2)
|
|
102 sys.exit()
|
|
103 try:
|
|
104 set1_rev = open(args.set1_output_right, 'wb')
|
|
105 set2_rev = open(args.set2_output_right, 'wb')
|
|
106 noclass_rev = open(args.unclassified_output_right, 'wb')
|
|
107 undecided_rev = open(args.intersection_output_right, 'wb')
|
|
108 except:
|
|
109 sys.stderr.write('Error: Cannot open rev outputfile(s).\n')
|
|
110 sys.exit()
|
|
111
|
|
112
|
|
113 for i in xrange(nReads):
|
|
114 try:
|
|
115 r1 = fwd.next()
|
|
116 except:
|
|
117 break
|
|
118 r2 = None
|
|
119 if rev is not None:
|
|
120 try:
|
|
121 r2 = rev.next()
|
|
122 except:
|
|
123 break
|
|
124
|
|
125 id_ = getFXID(r1[0])
|
|
126
|
|
127 if id_ in set1:
|
|
128 if id_ not in set2:
|
|
129 set1_fwd.write(writeFXRecord(r1))
|
|
130 if set1_rev:
|
|
131 set1_rev.write(writeFXRecord(r2))
|
|
132 else:
|
|
133 undecided_fwd.write(writeFXRecord(r1))
|
|
134 if undecided_rev:
|
|
135 undecided_rev.write(writeFXRecord(r2))
|
|
136
|
|
137 elif id_ in set2:
|
|
138 set2_fwd.write(writeFXRecord(r1))
|
|
139 if set2_rev:
|
|
140 set2_rev.write(writeFXRecord(r2))
|
|
141 else:
|
|
142 noclass_fwd.write(writeFXRecord(r1))
|
|
143 if noclass_rev:
|
|
144 noclass_rev.write(writeFXRecord(r2))
|
|
145 pass
|
|
146
|
|
147 set1_fwd.close()
|
|
148 set1_rev.close()
|
|
149 set2_fwd.close()
|
|
150 set2_rev.close()
|
|
151 noclass_fwd.close()
|
|
152 noclass_rev.close()
|
|
153 undecided_fwd.close()
|
|
154 undecided_rev.close()
|
|
155 pass
|
|
156
|
|
157 if __name__ == '__main__': main(sys.argv)
|