comparison filter-below-abund.py @ 45:0b238b083f77

2 more tools
author Michael R. Crusoe <mcrusoe@msu.edu>
date Sat, 12 Jul 2014 11:13:21 -0400
parents
children fe697e0cb24a
comparison
equal deleted inserted replaced
44:46d13bbb21f2 45:0b238b083f77
1 #! /usr/bin/env python2
2 #
3 # This file is part of khmer, http://github.com/ged-lab/khmer/, and is
4 # Copyright (C) Michigan State University, 2009-2013. It is licensed under
5 # the three-clause BSD license; see doc/LICENSE.txt.
6 # Contact: khmer-project@idyll.org
7 #
8 import sys
9 import screed.fasta
10 import os
11 import khmer
12 from khmer.thread_utils import ThreadedSequenceProcessor, verbose_fasta_iter
13
14 WORKER_THREADS = 8
15 GROUPSIZE = 100
16
17 CUTOFF = 50
18
19 ###
20
21
22 def main():
23 counting_ht = sys.argv[1]
24 infiles = sys.argv[2:]
25
26 print 'file with ht: %s' % counting_ht
27 print '-- settings:'
28 print 'N THREADS', WORKER_THREADS
29 print '--'
30
31 print 'making hashtable'
32 ht = khmer.load_counting_hash(counting_ht)
33 K = ht.ksize()
34
35 for infile in infiles:
36 print 'filtering', infile
37 outfile = os.path.basename(infile) + '.below'
38
39 outfp = open(outfile, 'w')
40
41 def process_fn(record, ht=ht):
42 name = record['name']
43 seq = record['sequence']
44 if 'N' in seq:
45 return None, None
46
47 trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF)
48
49 if trim_at >= K:
50 return name, trim_seq
51
52 return None, None
53
54 tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE)
55
56 tsp.start(verbose_fasta_iter(infile), outfp)
57
58 if __name__ == '__main__':
59 main()