annotate filter-below-abund.py @ 59:08a599cf71d0

Bjoern's suggestions
author Michael R. Crusoe <mcrusoe@msu.edu>
date Mon, 18 Aug 2014 07:02:05 -0400
parents 0b238b083f77
children fe697e0cb24a
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
45
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
1 #! /usr/bin/env python2
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
2 #
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
3 # This file is part of khmer, http://github.com/ged-lab/khmer/, and is
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
4 # Copyright (C) Michigan State University, 2009-2013. It is licensed under
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
5 # the three-clause BSD license; see doc/LICENSE.txt.
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
6 # Contact: khmer-project@idyll.org
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
7 #
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
8 import sys
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
9 import screed.fasta
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
10 import os
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
11 import khmer
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
12 from khmer.thread_utils import ThreadedSequenceProcessor, verbose_fasta_iter
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
13
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
14 WORKER_THREADS = 8
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
15 GROUPSIZE = 100
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
16
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
17 CUTOFF = 50
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
18
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
19 ###
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
20
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
21
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
22 def main():
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
23 counting_ht = sys.argv[1]
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
24 infiles = sys.argv[2:]
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
25
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
26 print 'file with ht: %s' % counting_ht
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
27 print '-- settings:'
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
28 print 'N THREADS', WORKER_THREADS
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
29 print '--'
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
30
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
31 print 'making hashtable'
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
32 ht = khmer.load_counting_hash(counting_ht)
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
33 K = ht.ksize()
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
34
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
35 for infile in infiles:
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
36 print 'filtering', infile
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
37 outfile = os.path.basename(infile) + '.below'
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
38
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
39 outfp = open(outfile, 'w')
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
40
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
41 def process_fn(record, ht=ht):
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
42 name = record['name']
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
43 seq = record['sequence']
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
44 if 'N' in seq:
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
45 return None, None
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
46
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
47 trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF)
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
48
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
49 if trim_at >= K:
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
50 return name, trim_seq
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
51
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
52 return None, None
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
53
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
54 tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE)
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
55
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
56 tsp.start(verbose_fasta_iter(infile), outfp)
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
57
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
58 if __name__ == '__main__':
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
59 main()