comparison kraken_filter.py @ 14:fd27c97c8366 draft default tip

Uploaded
author cschu
date Mon, 18 May 2015 15:55:47 -0400
parents 0916697409ea
children
comparison
equal deleted inserted replaced
13:dc02dbf5e1a9 14:fd27c97c8366
1 #!/usr/bin/env python
2
3 import os
4 import sys
5 import argparse
6 import subprocess
7 import struct
8 import shutil
9 import tempfile
10
11 from checkFileFormat import verifyFileFormat
12
13 DEFAULT_KRAKEN_DB='/tsl/data/krakendb/ktest/db'
14 logfile = None
15
16
17 def getDescendents(taxID, tree):
18 descendents = set([taxID])
19 queue = [taxID]
20 while queue:
21 node = queue.pop()
22
23 children = tree.get(node, set())
24 if children:
25 descendents = descendents.union(children)
26 queue.extend(children)
27 pass
28 return descendents
29
30 def anabl_getSeqsFromFastX(fn, X=2):
31 it = open(fn)
32 for head in it:
33 try:
34 yield (head.strip(), tuple(map(lambda x:x.strip(),
35 ([it.next() for i in xrange(X - 1)]))))
36 except:
37 break
38 pass
39 it.close()
40 pass
41
42 def readTaxonomy(db):
43 nodeNames = []
44 for line in open(os.path.join(db, 'taxonomy', 'names.dmp')):
45 line = line.strip().strip('|').strip()
46 if not line: break
47 line = line.split('\t|\t')
48
49 if line[3].strip() == 'scientific name':
50 nodeNames.append((int(line[0].strip()), line[1].strip()))
51 # break
52 pass
53
54 nodeRanks = []
55 nodeChildren = {}
56 for line in open(os.path.join(db, 'taxonomy', 'nodes.dmp')):
57 line = line.strip().strip('|').strip()
58 if not line: break
59 line = map(lambda x:x.strip(), line.split('\t|\t'))
60 line[:2] = map(int, line[:2])
61 if line[0] == 1:
62 line[1] = 0
63 try:
64 nodeChildren[line[1]].add(line[0])
65 except:
66 nodeChildren[line[1]] = set([line[0]])
67 nodeRanks.append((line[0], line[2]))
68 # break
69
70 return dict(nodeNames), dict(nodeRanks), nodeChildren
71
72 def getFastqIdentifier(string):
73 if string.endswith('/1') or string.endswith('/2'):
74 return string[1:-2]
75 else:
76 return string.split()[0][1:]
77
78 def runFilter(db, taxID, inputClassification, inputR1, outputR1, inputR2=None, outputR2=None, fastx=4, debug=False):
79 global logfile
80 logfile.write('Reading taxonomy...\n')
81 logfile.flush()
82 taxonomyInfo = readTaxonomy(db)
83 keepSequences = set()
84 logfile.write('Traversing requested taxonomy branch...\n')
85 logfile.flush()
86 validIDs = getDescendents(taxID, taxonomyInfo[2])
87 logfile.write('Family tree of %s has %i descendents.\n' % (str(taxID), len(validIDs)))
88 logfile.flush()
89 if debug:
90 for vid in validIDs:
91 logfile.write('Added %s to validIDs.\n' % vid)
92 logfile.flush()
93
94
95 logfile.write('Filtering sequences...\n')
96 logfile.flush()
97 nseqs = 0
98 for line in open(inputClassification):
99 nseqs += 1
100 line = line.strip().split()
101 takeUnclassified = taxID == 0 and line[0] == 'U'
102 takeClassified = line[0] == 'C' and int(line[2]) in validIDs
103 if takeUnclassified or takeClassified:
104 keepSequences.add(line[1].strip())
105 if debug:
106 logfile.write('Added %s to keepSequences.\n' % line[1].strip())
107 logfile.flush()
108 logfile.write('Keeping %i of %i sequences (%.1f).\n' % (len(keepSequences), nseqs, float(len(keepSequences))/nseqs))
109 logfile.flush()
110 logfile.write('Writing filtered sequence sets...\n')
111 logfile.flush()
112 fwdOut = open(outputR1, 'wb')
113 fwdGen = anabl_getSeqsFromFastX(inputR1, X=fastx)
114
115 revOut, revGen = None, None
116 revSid, revSeq = None, None
117 if outputR2 is not None and inputR2 is not None:
118 revOut = open(outputR2, 'wb')
119 revGen = anabl_getSeqsFromFastX(inputR2, X=fastx)
120
121 fqid1, fqid2 = None, None
122 while True:
123 try:
124 fwdSid, fwdSeq = fwdGen.next()
125 fqid1 = getFastqIdentifier(fwdSid)
126 if revGen is not None:
127 revSid, revSeq = revGen.next()
128 fqid2 = getFastqIdentifier(revSid)
129 except:
130 break
131 if fqid1 != fqid2 and fqid2 is not None:
132 sys.stderr.write('Error: fqid-mismatch %s %s.\n' % (fqid1, fqid2))
133 sys.exit(1)
134 if fqid1 in keepSequences:
135 fwdOut.write(('%s\n' * fastx) % ((fwdSid,) + fwdSeq))
136 if revOut is not None:
137 revOut.write(('%s\n' * fastx) % ((revSid,) + revSeq))
138 else:
139 # sys.stdout.write('%s is not in keepSequences\n' % fqid1)
140 pass
141 fwdOut.close()
142 if revOut is not None:
143 revOut.close()
144 pass
145
146
147 def main(argv):
148
149 descr = ''
150 parser = argparse.ArgumentParser(description=descr)
151 parser.add_argument('--dbtype', default='builtinDB', help='Is database built-in or supplied externally?')
152 parser.add_argument('--db', default=DEFAULT_KRAKEN_DB, help='Path to kraken db')
153 parser.add_argument('--in1', help='The r1-file (single-end reads or left paired-end reads).')
154 parser.add_argument('--in2', help='The r2-file (right paired-end reads)')
155 parser.add_argument('--taxid', default=1, help='The root taxon id of the requested branch', type=int)
156 parser.add_argument('--kraken-results', type=str, help='A file containing kraken classification results for the input sequences.')
157 parser.add_argument('--input-format', help='Input sequences stored in fa or fq file(s).', default='fq')
158
159 parser.add_argument('--out1', type=str, help='The r1-output file.')
160 parser.add_argument('--out2', type=str, help='The r2-output file.')
161 parser.add_argument('--debug', action='store_true')
162 parser.add_argument('--logfile', type=str, help='A logfile.', default='kraken_filter.log')
163 args = parser.parse_args()
164
165 kraken_params = []
166 kraken_input = []
167
168 global logfile
169 logfile = open(args.logfile, 'wb')
170 if 'db' not in args or not os.path.exists(args.db):
171 sys.stderr.write('Error: database is missing.\n')
172 sys.exit(1)
173 kraken_params.extend(['--db', args.db])
174 # check whether input file(s) exist(s)
175
176 if 'kraken_results' in args and os.path.exists(args.kraken_results):
177 kraken_input.append(args.kraken_results)
178 else:
179 sys.stderr.write('Error: kraken-classification is missing.\n')
180 sys.exit(1)
181 pass
182
183 if not ('in1' in args and os.path.exists(args.in1)):
184 sys.stderr.write('Error: left/single input file (%s) is missing.\n' % args.in1)
185 sys.exit(1)
186 if not verifyFileFormat(args.in1, args.input_format):
187 sys.stderr.write('Error: fwd/single input file has the wrong format assigned.\n')
188 sys.exit(1)
189
190 kraken_input.append(args.in1)
191 if 'in2' in args:
192 if args.in2 is not None and not os.path.exists(args.in2):
193 sys.stderr.write('Error: right input file (%s) is missing.\n' % args.in2)
194 sys.exit(1)
195 elif args.in2 is not None and not verifyFileFormat(args.in2, args.input_format):
196 sys.stderr.write('Error: rev input file has the wrong format assigned.\n')
197 sys.exit(1)
198 elif args.in2 is not None:
199 kraken_params.append('--paired')
200 kraken_input.append(args.in2)
201 else:
202 pass
203 else:
204 pass
205
206 if 'out2' in args and 'in2' in args:
207 input2, output2 = args.in2, args.out2
208 else:
209 input2, output2 = None, None
210
211 if args.taxid < 0:
212 sys.stderr.write('Error: invalid taxon id %i\n' % args.taxid)
213 sys.exit(1)
214 kraken_params.extend(['--taxid', args.taxid])
215
216 if args.input_format == 'fq':
217 kraken_params.append('--fastq-input')
218 fastx = 4
219 else:
220 kraken_params.append('--fasta-input')
221 fastx = 2
222
223
224
225 # firstChar = open(args.in1).read(1)
226 # if firstChar == '>':
227 # kraken_params.append('--fasta-input')
228 # fastx = 2
229 # elif firstChar == '@':
230 # kraken_params.append('--fastq-input')
231 # fastx = 4
232 #else:
233 # """ This will currently disable working with gzipped/bzipped files. """
234 # sys.stderr.write('Error: Input file starts with unknown symbol %s.\n' % firstChar)
235 # sys.exit(1)
236 # pass
237
238 #open(args.kraken_filtered_r1, 'wb').write('\n'.join(map(str, kraken_params + kraken_input)))
239
240 runFilter(args.db, int(args.taxid), args.kraken_results,
241 args.in1, args.out1,
242 inputR2=input2, outputR2=output2, fastx=fastx,
243 debug=args.debug)
244
245 logfile.close()
246
247 pass
248
249
250
251
252 # main(sys.argv[1:])
253
254 if __name__ == '__main__': main(sys.argv[1:])