0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import os
|
|
4 import sys
|
|
5 import argparse
|
|
6 import subprocess
|
|
7 import struct
|
|
8 import shutil
|
|
9 import tempfile
|
|
10
|
|
11 from checkFileFormat import verifyFileFormat
|
|
12
|
|
13 DEFAULT_KRAKEN_DB='/tsl/data/krakendb/ktest/db'
|
|
14 logfile = None
|
|
15
|
|
16
|
|
17 def getDescendents(taxID, tree):
|
|
18 descendents = set([taxID])
|
|
19 queue = [taxID]
|
|
20 while queue:
|
|
21 node = queue.pop()
|
|
22
|
|
23 children = tree.get(node, set())
|
|
24 if children:
|
|
25 descendents = descendents.union(children)
|
|
26 queue.extend(children)
|
|
27 pass
|
|
28 return descendents
|
|
29
|
|
30 def anabl_getSeqsFromFastX(fn, X=2):
|
|
31 it = open(fn)
|
|
32 for head in it:
|
|
33 try:
|
|
34 yield (head.strip(), tuple(map(lambda x:x.strip(),
|
|
35 ([it.next() for i in xrange(X - 1)]))))
|
|
36 except:
|
|
37 break
|
|
38 pass
|
|
39 it.close()
|
|
40 pass
|
|
41
|
|
42 def readTaxonomy(db):
|
|
43 nodeNames = []
|
|
44 for line in open(os.path.join(db, 'taxonomy', 'names.dmp')):
|
|
45 line = line.strip().strip('|').strip()
|
|
46 if not line: break
|
|
47 line = line.split('\t|\t')
|
|
48
|
|
49 if line[3].strip() == 'scientific name':
|
|
50 nodeNames.append((int(line[0].strip()), line[1].strip()))
|
|
51 # break
|
|
52 pass
|
|
53
|
|
54 nodeRanks = []
|
|
55 nodeChildren = {}
|
|
56 for line in open(os.path.join(db, 'taxonomy', 'nodes.dmp')):
|
|
57 line = line.strip().strip('|').strip()
|
|
58 if not line: break
|
|
59 line = map(lambda x:x.strip(), line.split('\t|\t'))
|
|
60 line[:2] = map(int, line[:2])
|
|
61 if line[0] == 1:
|
|
62 line[1] = 0
|
|
63 try:
|
|
64 nodeChildren[line[1]].add(line[0])
|
|
65 except:
|
|
66 nodeChildren[line[1]] = set([line[0]])
|
|
67 nodeRanks.append((line[0], line[2]))
|
|
68 # break
|
|
69
|
|
70 return dict(nodeNames), dict(nodeRanks), nodeChildren
|
|
71
|
|
72 def getFastqIdentifier(string):
|
|
73 if string.endswith('/1') or string.endswith('/2'):
|
|
74 return string[1:-2]
|
|
75 else:
|
|
76 return string.split()[0][1:]
|
|
77
|
|
78 def runFilter(db, taxID, inputClassification, inputR1, outputR1, inputR2=None, outputR2=None, fastx=4, debug=False):
|
|
79 global logfile
|
|
80 logfile.write('Reading taxonomy...\n')
|
|
81 logfile.flush()
|
|
82 taxonomyInfo = readTaxonomy(db)
|
|
83 keepSequences = set()
|
|
84 logfile.write('Traversing requested taxonomy branch...\n')
|
|
85 logfile.flush()
|
|
86 validIDs = getDescendents(taxID, taxonomyInfo[2])
|
|
87 logfile.write('Family tree of %s has %i descendents.\n' % (str(taxID), len(validIDs)))
|
|
88 logfile.flush()
|
|
89 if debug:
|
|
90 for vid in validIDs:
|
|
91 logfile.write('Added %s to validIDs.\n' % vid)
|
|
92 logfile.flush()
|
|
93
|
|
94
|
|
95 logfile.write('Filtering sequences...\n')
|
|
96 logfile.flush()
|
|
97 nseqs = 0
|
|
98 for line in open(inputClassification):
|
|
99 nseqs += 1
|
|
100 line = line.strip().split()
|
|
101 takeUnclassified = taxID == 0 and line[0] == 'U'
|
|
102 takeClassified = line[0] == 'C' and int(line[2]) in validIDs
|
|
103 if takeUnclassified or takeClassified:
|
|
104 keepSequences.add(line[1].strip())
|
|
105 if debug:
|
|
106 logfile.write('Added %s to keepSequences.\n' % line[1].strip())
|
|
107 logfile.flush()
|
|
108 logfile.write('Keeping %i of %i sequences (%.1f).\n' % (len(keepSequences), nseqs, float(len(keepSequences))/nseqs))
|
|
109 logfile.flush()
|
|
110 logfile.write('Writing filtered sequence sets...\n')
|
|
111 logfile.flush()
|
|
112 fwdOut = open(outputR1, 'wb')
|
|
113 fwdGen = anabl_getSeqsFromFastX(inputR1, X=fastx)
|
|
114
|
|
115 revOut, revGen = None, None
|
|
116 revSid, revSeq = None, None
|
|
117 if outputR2 is not None and inputR2 is not None:
|
|
118 revOut = open(outputR2, 'wb')
|
|
119 revGen = anabl_getSeqsFromFastX(inputR2, X=fastx)
|
|
120
|
|
121 fqid1, fqid2 = None, None
|
|
122 while True:
|
|
123 try:
|
|
124 fwdSid, fwdSeq = fwdGen.next()
|
|
125 fqid1 = getFastqIdentifier(fwdSid)
|
|
126 if revGen is not None:
|
|
127 revSid, revSeq = revGen.next()
|
|
128 fqid2 = getFastqIdentifier(revSid)
|
|
129 except:
|
|
130 break
|
|
131 if fqid1 != fqid2 and fqid2 is not None:
|
|
132 sys.stderr.write('Error: fqid-mismatch %s %s.\n' % (fqid1, fqid2))
|
|
133 sys.exit(1)
|
|
134 if fqid1 in keepSequences:
|
|
135 fwdOut.write(('%s\n' * fastx) % ((fwdSid,) + fwdSeq))
|
|
136 if revOut is not None:
|
|
137 revOut.write(('%s\n' * fastx) % ((revSid,) + revSeq))
|
|
138 else:
|
|
139 # sys.stdout.write('%s is not in keepSequences\n' % fqid1)
|
|
140 pass
|
|
141 fwdOut.close()
|
|
142 if revOut is not None:
|
|
143 revOut.close()
|
|
144 pass
|
|
145
|
|
146
|
|
147 def main(argv):
|
|
148
|
|
149 descr = ''
|
|
150 parser = argparse.ArgumentParser(description=descr)
|
|
151 parser.add_argument('--dbtype', default='builtinDB', help='Is database built-in or supplied externally?')
|
|
152 parser.add_argument('--db', default=DEFAULT_KRAKEN_DB, help='Path to kraken db')
|
|
153 parser.add_argument('--in1', help='The r1-file (single-end reads or left paired-end reads).')
|
|
154 parser.add_argument('--in2', help='The r2-file (right paired-end reads)')
|
|
155 parser.add_argument('--taxid', default=1, help='The root taxon id of the requested branch', type=int)
|
|
156 parser.add_argument('--kraken-results', type=str, help='A file containing kraken classification results for the input sequences.')
|
|
157 parser.add_argument('--input-format', help='Input sequences stored in fa or fq file(s).', default='fq')
|
|
158
|
|
159 parser.add_argument('--out1', type=str, help='The r1-output file.')
|
|
160 parser.add_argument('--out2', type=str, help='The r2-output file.')
|
|
161 parser.add_argument('--debug', action='store_true')
|
|
162 parser.add_argument('--logfile', type=str, help='A logfile.', default='kraken_filter.log')
|
|
163 args = parser.parse_args()
|
|
164
|
|
165 kraken_params = []
|
|
166 kraken_input = []
|
|
167
|
|
168 global logfile
|
|
169 logfile = open(args.logfile, 'wb')
|
|
170 if 'db' not in args or not os.path.exists(args.db):
|
|
171 sys.stderr.write('Error: database is missing.\n')
|
|
172 sys.exit(1)
|
|
173 kraken_params.extend(['--db', args.db])
|
|
174 # check whether input file(s) exist(s)
|
|
175
|
|
176 if 'kraken_results' in args and os.path.exists(args.kraken_results):
|
|
177 kraken_input.append(args.kraken_results)
|
|
178 else:
|
|
179 sys.stderr.write('Error: kraken-classification is missing.\n')
|
|
180 sys.exit(1)
|
|
181 pass
|
|
182
|
|
183 if not ('in1' in args and os.path.exists(args.in1)):
|
|
184 sys.stderr.write('Error: left/single input file (%s) is missing.\n' % args.in1)
|
|
185 sys.exit(1)
|
|
186 if not verifyFileFormat(args.in1, args.input_format):
|
|
187 sys.stderr.write('Error: fwd/single input file has the wrong format assigned.\n')
|
|
188 sys.exit(1)
|
|
189
|
|
190 kraken_input.append(args.in1)
|
|
191 if 'in2' in args:
|
|
192 if args.in2 is not None and not os.path.exists(args.in2):
|
|
193 sys.stderr.write('Error: right input file (%s) is missing.\n' % args.in2)
|
|
194 sys.exit(1)
|
|
195 elif args.in2 is not None and not verifyFileFormat(args.in2, args.input_format):
|
|
196 sys.stderr.write('Error: rev input file has the wrong format assigned.\n')
|
|
197 sys.exit(1)
|
|
198 elif args.in2 is not None:
|
|
199 kraken_params.append('--paired')
|
|
200 kraken_input.append(args.in2)
|
|
201 else:
|
|
202 pass
|
|
203 else:
|
|
204 pass
|
|
205
|
|
206 if 'out2' in args and 'in2' in args:
|
|
207 input2, output2 = args.in2, args.out2
|
|
208 else:
|
|
209 input2, output2 = None, None
|
|
210
|
|
211 if args.taxid < 0:
|
|
212 sys.stderr.write('Error: invalid taxon id %i\n' % args.taxid)
|
|
213 sys.exit(1)
|
|
214 kraken_params.extend(['--taxid', args.taxid])
|
|
215
|
|
216 if args.input_format == 'fq':
|
|
217 kraken_params.append('--fastq-input')
|
|
218 fastx = 4
|
|
219 else:
|
|
220 kraken_params.append('--fasta-input')
|
|
221 fastx = 2
|
|
222
|
|
223
|
|
224
|
|
225 # firstChar = open(args.in1).read(1)
|
|
226 # if firstChar == '>':
|
|
227 # kraken_params.append('--fasta-input')
|
|
228 # fastx = 2
|
|
229 # elif firstChar == '@':
|
|
230 # kraken_params.append('--fastq-input')
|
|
231 # fastx = 4
|
|
232 #else:
|
|
233 # """ This will currently disable working with gzipped/bzipped files. """
|
|
234 # sys.stderr.write('Error: Input file starts with unknown symbol %s.\n' % firstChar)
|
|
235 # sys.exit(1)
|
|
236 # pass
|
|
237
|
|
238 #open(args.kraken_filtered_r1, 'wb').write('\n'.join(map(str, kraken_params + kraken_input)))
|
|
239
|
|
240 runFilter(args.db, int(args.taxid), args.kraken_results,
|
|
241 args.in1, args.out1,
|
|
242 inputR2=input2, outputR2=output2, fastx=fastx,
|
|
243 debug=args.debug)
|
|
244
|
|
245 logfile.close()
|
|
246
|
|
247 pass
|
|
248
|
|
249
|
|
250
|
|
251
|
|
252 # main(sys.argv[1:])
|
|
253
|
|
254 if __name__ == '__main__': main(sys.argv[1:])
|