Mercurial > repos > cschu > kraken_tools
comparison kraken_filter.py @ 14:fd27c97c8366 draft default tip
Uploaded
author | cschu |
---|---|
date | Mon, 18 May 2015 15:55:47 -0400 |
parents | 0916697409ea |
children |
comparison
equal
deleted
inserted
replaced
13:dc02dbf5e1a9 | 14:fd27c97c8366 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import os | |
4 import sys | |
5 import argparse | |
6 import subprocess | |
7 import struct | |
8 import shutil | |
9 import tempfile | |
10 | |
11 from checkFileFormat import verifyFileFormat | |
12 | |
13 DEFAULT_KRAKEN_DB='/tsl/data/krakendb/ktest/db' | |
14 logfile = None | |
15 | |
16 | |
17 def getDescendents(taxID, tree): | |
18 descendents = set([taxID]) | |
19 queue = [taxID] | |
20 while queue: | |
21 node = queue.pop() | |
22 | |
23 children = tree.get(node, set()) | |
24 if children: | |
25 descendents = descendents.union(children) | |
26 queue.extend(children) | |
27 pass | |
28 return descendents | |
29 | |
30 def anabl_getSeqsFromFastX(fn, X=2): | |
31 it = open(fn) | |
32 for head in it: | |
33 try: | |
34 yield (head.strip(), tuple(map(lambda x:x.strip(), | |
35 ([it.next() for i in xrange(X - 1)])))) | |
36 except: | |
37 break | |
38 pass | |
39 it.close() | |
40 pass | |
41 | |
42 def readTaxonomy(db): | |
43 nodeNames = [] | |
44 for line in open(os.path.join(db, 'taxonomy', 'names.dmp')): | |
45 line = line.strip().strip('|').strip() | |
46 if not line: break | |
47 line = line.split('\t|\t') | |
48 | |
49 if line[3].strip() == 'scientific name': | |
50 nodeNames.append((int(line[0].strip()), line[1].strip())) | |
51 # break | |
52 pass | |
53 | |
54 nodeRanks = [] | |
55 nodeChildren = {} | |
56 for line in open(os.path.join(db, 'taxonomy', 'nodes.dmp')): | |
57 line = line.strip().strip('|').strip() | |
58 if not line: break | |
59 line = map(lambda x:x.strip(), line.split('\t|\t')) | |
60 line[:2] = map(int, line[:2]) | |
61 if line[0] == 1: | |
62 line[1] = 0 | |
63 try: | |
64 nodeChildren[line[1]].add(line[0]) | |
65 except: | |
66 nodeChildren[line[1]] = set([line[0]]) | |
67 nodeRanks.append((line[0], line[2])) | |
68 # break | |
69 | |
70 return dict(nodeNames), dict(nodeRanks), nodeChildren | |
71 | |
72 def getFastqIdentifier(string): | |
73 if string.endswith('/1') or string.endswith('/2'): | |
74 return string[1:-2] | |
75 else: | |
76 return string.split()[0][1:] | |
77 | |
78 def runFilter(db, taxID, inputClassification, inputR1, outputR1, inputR2=None, outputR2=None, fastx=4, debug=False): | |
79 global logfile | |
80 logfile.write('Reading taxonomy...\n') | |
81 logfile.flush() | |
82 taxonomyInfo = readTaxonomy(db) | |
83 keepSequences = set() | |
84 logfile.write('Traversing requested taxonomy branch...\n') | |
85 logfile.flush() | |
86 validIDs = getDescendents(taxID, taxonomyInfo[2]) | |
87 logfile.write('Family tree of %s has %i descendents.\n' % (str(taxID), len(validIDs))) | |
88 logfile.flush() | |
89 if debug: | |
90 for vid in validIDs: | |
91 logfile.write('Added %s to validIDs.\n' % vid) | |
92 logfile.flush() | |
93 | |
94 | |
95 logfile.write('Filtering sequences...\n') | |
96 logfile.flush() | |
97 nseqs = 0 | |
98 for line in open(inputClassification): | |
99 nseqs += 1 | |
100 line = line.strip().split() | |
101 takeUnclassified = taxID == 0 and line[0] == 'U' | |
102 takeClassified = line[0] == 'C' and int(line[2]) in validIDs | |
103 if takeUnclassified or takeClassified: | |
104 keepSequences.add(line[1].strip()) | |
105 if debug: | |
106 logfile.write('Added %s to keepSequences.\n' % line[1].strip()) | |
107 logfile.flush() | |
108 logfile.write('Keeping %i of %i sequences (%.1f).\n' % (len(keepSequences), nseqs, float(len(keepSequences))/nseqs)) | |
109 logfile.flush() | |
110 logfile.write('Writing filtered sequence sets...\n') | |
111 logfile.flush() | |
112 fwdOut = open(outputR1, 'wb') | |
113 fwdGen = anabl_getSeqsFromFastX(inputR1, X=fastx) | |
114 | |
115 revOut, revGen = None, None | |
116 revSid, revSeq = None, None | |
117 if outputR2 is not None and inputR2 is not None: | |
118 revOut = open(outputR2, 'wb') | |
119 revGen = anabl_getSeqsFromFastX(inputR2, X=fastx) | |
120 | |
121 fqid1, fqid2 = None, None | |
122 while True: | |
123 try: | |
124 fwdSid, fwdSeq = fwdGen.next() | |
125 fqid1 = getFastqIdentifier(fwdSid) | |
126 if revGen is not None: | |
127 revSid, revSeq = revGen.next() | |
128 fqid2 = getFastqIdentifier(revSid) | |
129 except: | |
130 break | |
131 if fqid1 != fqid2 and fqid2 is not None: | |
132 sys.stderr.write('Error: fqid-mismatch %s %s.\n' % (fqid1, fqid2)) | |
133 sys.exit(1) | |
134 if fqid1 in keepSequences: | |
135 fwdOut.write(('%s\n' * fastx) % ((fwdSid,) + fwdSeq)) | |
136 if revOut is not None: | |
137 revOut.write(('%s\n' * fastx) % ((revSid,) + revSeq)) | |
138 else: | |
139 # sys.stdout.write('%s is not in keepSequences\n' % fqid1) | |
140 pass | |
141 fwdOut.close() | |
142 if revOut is not None: | |
143 revOut.close() | |
144 pass | |
145 | |
146 | |
147 def main(argv): | |
148 | |
149 descr = '' | |
150 parser = argparse.ArgumentParser(description=descr) | |
151 parser.add_argument('--dbtype', default='builtinDB', help='Is database built-in or supplied externally?') | |
152 parser.add_argument('--db', default=DEFAULT_KRAKEN_DB, help='Path to kraken db') | |
153 parser.add_argument('--in1', help='The r1-file (single-end reads or left paired-end reads).') | |
154 parser.add_argument('--in2', help='The r2-file (right paired-end reads)') | |
155 parser.add_argument('--taxid', default=1, help='The root taxon id of the requested branch', type=int) | |
156 parser.add_argument('--kraken-results', type=str, help='A file containing kraken classification results for the input sequences.') | |
157 parser.add_argument('--input-format', help='Input sequences stored in fa or fq file(s).', default='fq') | |
158 | |
159 parser.add_argument('--out1', type=str, help='The r1-output file.') | |
160 parser.add_argument('--out2', type=str, help='The r2-output file.') | |
161 parser.add_argument('--debug', action='store_true') | |
162 parser.add_argument('--logfile', type=str, help='A logfile.', default='kraken_filter.log') | |
163 args = parser.parse_args() | |
164 | |
165 kraken_params = [] | |
166 kraken_input = [] | |
167 | |
168 global logfile | |
169 logfile = open(args.logfile, 'wb') | |
170 if 'db' not in args or not os.path.exists(args.db): | |
171 sys.stderr.write('Error: database is missing.\n') | |
172 sys.exit(1) | |
173 kraken_params.extend(['--db', args.db]) | |
174 # check whether input file(s) exist(s) | |
175 | |
176 if 'kraken_results' in args and os.path.exists(args.kraken_results): | |
177 kraken_input.append(args.kraken_results) | |
178 else: | |
179 sys.stderr.write('Error: kraken-classification is missing.\n') | |
180 sys.exit(1) | |
181 pass | |
182 | |
183 if not ('in1' in args and os.path.exists(args.in1)): | |
184 sys.stderr.write('Error: left/single input file (%s) is missing.\n' % args.in1) | |
185 sys.exit(1) | |
186 if not verifyFileFormat(args.in1, args.input_format): | |
187 sys.stderr.write('Error: fwd/single input file has the wrong format assigned.\n') | |
188 sys.exit(1) | |
189 | |
190 kraken_input.append(args.in1) | |
191 if 'in2' in args: | |
192 if args.in2 is not None and not os.path.exists(args.in2): | |
193 sys.stderr.write('Error: right input file (%s) is missing.\n' % args.in2) | |
194 sys.exit(1) | |
195 elif args.in2 is not None and not verifyFileFormat(args.in2, args.input_format): | |
196 sys.stderr.write('Error: rev input file has the wrong format assigned.\n') | |
197 sys.exit(1) | |
198 elif args.in2 is not None: | |
199 kraken_params.append('--paired') | |
200 kraken_input.append(args.in2) | |
201 else: | |
202 pass | |
203 else: | |
204 pass | |
205 | |
206 if 'out2' in args and 'in2' in args: | |
207 input2, output2 = args.in2, args.out2 | |
208 else: | |
209 input2, output2 = None, None | |
210 | |
211 if args.taxid < 0: | |
212 sys.stderr.write('Error: invalid taxon id %i\n' % args.taxid) | |
213 sys.exit(1) | |
214 kraken_params.extend(['--taxid', args.taxid]) | |
215 | |
216 if args.input_format == 'fq': | |
217 kraken_params.append('--fastq-input') | |
218 fastx = 4 | |
219 else: | |
220 kraken_params.append('--fasta-input') | |
221 fastx = 2 | |
222 | |
223 | |
224 | |
225 # firstChar = open(args.in1).read(1) | |
226 # if firstChar == '>': | |
227 # kraken_params.append('--fasta-input') | |
228 # fastx = 2 | |
229 # elif firstChar == '@': | |
230 # kraken_params.append('--fastq-input') | |
231 # fastx = 4 | |
232 #else: | |
233 # """ This will currently disable working with gzipped/bzipped files. """ | |
234 # sys.stderr.write('Error: Input file starts with unknown symbol %s.\n' % firstChar) | |
235 # sys.exit(1) | |
236 # pass | |
237 | |
238 #open(args.kraken_filtered_r1, 'wb').write('\n'.join(map(str, kraken_params + kraken_input))) | |
239 | |
240 runFilter(args.db, int(args.taxid), args.kraken_results, | |
241 args.in1, args.out1, | |
242 inputR2=input2, outputR2=output2, fastx=fastx, | |
243 debug=args.debug) | |
244 | |
245 logfile.close() | |
246 | |
247 pass | |
248 | |
249 | |
250 | |
251 | |
252 # main(sys.argv[1:]) | |
253 | |
254 if __name__ == '__main__': main(sys.argv[1:]) |