45
|
1 #! /usr/bin/env python2
|
|
2 #
|
|
3 # This file is part of khmer, http://github.com/ged-lab/khmer/, and is
|
|
4 # Copyright (C) Michigan State University, 2009-2013. It is licensed under
|
|
5 # the three-clause BSD license; see doc/LICENSE.txt.
|
|
6 # Contact: khmer-project@idyll.org
|
|
7 #
|
|
8 import sys
|
|
9 import screed.fasta
|
|
10 import os
|
|
11 import khmer
|
|
12 from khmer.thread_utils import ThreadedSequenceProcessor, verbose_fasta_iter
|
|
13
|
|
14 WORKER_THREADS = 8
|
|
15 GROUPSIZE = 100
|
|
16
|
|
17 CUTOFF = 50
|
|
18
|
|
19 ###
|
|
20
|
|
21
|
|
22 def main():
|
|
23 counting_ht = sys.argv[1]
|
|
24 infiles = sys.argv[2:]
|
|
25
|
|
26 print 'file with ht: %s' % counting_ht
|
|
27 print '-- settings:'
|
|
28 print 'N THREADS', WORKER_THREADS
|
|
29 print '--'
|
|
30
|
|
31 print 'making hashtable'
|
|
32 ht = khmer.load_counting_hash(counting_ht)
|
|
33 K = ht.ksize()
|
|
34
|
|
35 for infile in infiles:
|
|
36 print 'filtering', infile
|
|
37 outfile = os.path.basename(infile) + '.below'
|
|
38
|
|
39 outfp = open(outfile, 'w')
|
|
40
|
|
41 def process_fn(record, ht=ht):
|
|
42 name = record['name']
|
|
43 seq = record['sequence']
|
|
44 if 'N' in seq:
|
|
45 return None, None
|
|
46
|
|
47 trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF)
|
|
48
|
|
49 if trim_at >= K:
|
|
50 return name, trim_seq
|
|
51
|
|
52 return None, None
|
|
53
|
|
54 tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE)
|
|
55
|
|
56 tsp.start(verbose_fasta_iter(infile), outfp)
|
|
57
|
|
58 if __name__ == '__main__':
|
|
59 main()
|