annotate filter-below-abund.py @ 60:fe697e0cb24a draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/blob/master/tools/khmer/ commit d8e0950d53e504e02ee5db43c0804142b14d7fd2-dirty
author crusoe
date Tue, 07 Jul 2015 11:59:39 -0400
parents 0b238b083f77
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
60
fe697e0cb24a planemo upload for repository https://github.com/galaxyproject/tools-iuc/blob/master/tools/khmer/ commit d8e0950d53e504e02ee5db43c0804142b14d7fd2-dirty
crusoe
parents: 45
diff changeset
1 #! /usr/bin/env python
45
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
2 #
60
fe697e0cb24a planemo upload for repository https://github.com/galaxyproject/tools-iuc/blob/master/tools/khmer/ commit d8e0950d53e504e02ee5db43c0804142b14d7fd2-dirty
crusoe
parents: 45
diff changeset
3 # This file is part of khmer, https://github.com/dib-lab/khmer/, and is
fe697e0cb24a planemo upload for repository https://github.com/galaxyproject/tools-iuc/blob/master/tools/khmer/ commit d8e0950d53e504e02ee5db43c0804142b14d7fd2-dirty
crusoe
parents: 45
diff changeset
4 # Copyright (C) Michigan State University, 2009-2015. It is licensed under
fe697e0cb24a planemo upload for repository https://github.com/galaxyproject/tools-iuc/blob/master/tools/khmer/ commit d8e0950d53e504e02ee5db43c0804142b14d7fd2-dirty
crusoe
parents: 45
diff changeset
5 # the three-clause BSD license; see LICENSE.
45
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
6 # Contact: khmer-project@idyll.org
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
7 #
60
fe697e0cb24a planemo upload for repository https://github.com/galaxyproject/tools-iuc/blob/master/tools/khmer/ commit d8e0950d53e504e02ee5db43c0804142b14d7fd2-dirty
crusoe
parents: 45
diff changeset
8 from __future__ import print_function
45
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
9 import sys
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
10 import os
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
11 import khmer
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
12 from khmer.thread_utils import ThreadedSequenceProcessor, verbose_fasta_iter
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
13
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
14 WORKER_THREADS = 8
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
15 GROUPSIZE = 100
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
16
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
17 CUTOFF = 50
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
18
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
19 ###
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
20
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
21
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
22 def main():
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
23 counting_ht = sys.argv[1]
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
24 infiles = sys.argv[2:]
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
25
60
fe697e0cb24a planemo upload for repository https://github.com/galaxyproject/tools-iuc/blob/master/tools/khmer/ commit d8e0950d53e504e02ee5db43c0804142b14d7fd2-dirty
crusoe
parents: 45
diff changeset
26 print('file with ht: %s' % counting_ht)
fe697e0cb24a planemo upload for repository https://github.com/galaxyproject/tools-iuc/blob/master/tools/khmer/ commit d8e0950d53e504e02ee5db43c0804142b14d7fd2-dirty
crusoe
parents: 45
diff changeset
27 print('-- settings:')
fe697e0cb24a planemo upload for repository https://github.com/galaxyproject/tools-iuc/blob/master/tools/khmer/ commit d8e0950d53e504e02ee5db43c0804142b14d7fd2-dirty
crusoe
parents: 45
diff changeset
28 print('N THREADS', WORKER_THREADS)
fe697e0cb24a planemo upload for repository https://github.com/galaxyproject/tools-iuc/blob/master/tools/khmer/ commit d8e0950d53e504e02ee5db43c0804142b14d7fd2-dirty
crusoe
parents: 45
diff changeset
29 print('--')
45
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
30
60
fe697e0cb24a planemo upload for repository https://github.com/galaxyproject/tools-iuc/blob/master/tools/khmer/ commit d8e0950d53e504e02ee5db43c0804142b14d7fd2-dirty
crusoe
parents: 45
diff changeset
31 print('making hashtable')
45
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
32 ht = khmer.load_counting_hash(counting_ht)
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
33 K = ht.ksize()
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
34
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
35 for infile in infiles:
60
fe697e0cb24a planemo upload for repository https://github.com/galaxyproject/tools-iuc/blob/master/tools/khmer/ commit d8e0950d53e504e02ee5db43c0804142b14d7fd2-dirty
crusoe
parents: 45
diff changeset
36 print('filtering', infile)
45
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
37 outfile = os.path.basename(infile) + '.below'
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
38
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
39 outfp = open(outfile, 'w')
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
40
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
41 def process_fn(record, ht=ht):
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
42 name = record['name']
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
43 seq = record['sequence']
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
44 if 'N' in seq:
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
45 return None, None
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
46
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
47 trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF)
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
48
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
49 if trim_at >= K:
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
50 return name, trim_seq
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
51
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
52 return None, None
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
53
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
54 tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE)
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
55
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
56 tsp.start(verbose_fasta_iter(infile), outfp)
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
57
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
58 if __name__ == '__main__':
0b238b083f77 2 more tools
Michael R. Crusoe <mcrusoe@msu.edu>
parents:
diff changeset
59 main()