Mercurial > repos > crusoe > khmer
comparison filter-below-abund.py @ 45:0b238b083f77
2 more tools
| author | Michael R. Crusoe <mcrusoe@msu.edu> |
|---|---|
| date | Sat, 12 Jul 2014 11:13:21 -0400 |
| parents | |
| children | fe697e0cb24a |
comparison
equal
deleted
inserted
replaced
| 44:46d13bbb21f2 | 45:0b238b083f77 |
|---|---|
| 1 #! /usr/bin/env python2 | |
| 2 # | |
| 3 # This file is part of khmer, http://github.com/ged-lab/khmer/, and is | |
| 4 # Copyright (C) Michigan State University, 2009-2013. It is licensed under | |
| 5 # the three-clause BSD license; see doc/LICENSE.txt. | |
| 6 # Contact: khmer-project@idyll.org | |
| 7 # | |
| 8 import sys | |
| 9 import screed.fasta | |
| 10 import os | |
| 11 import khmer | |
| 12 from khmer.thread_utils import ThreadedSequenceProcessor, verbose_fasta_iter | |
| 13 | |
| 14 WORKER_THREADS = 8 | |
| 15 GROUPSIZE = 100 | |
| 16 | |
| 17 CUTOFF = 50 | |
| 18 | |
| 19 ### | |
| 20 | |
| 21 | |
| 22 def main(): | |
| 23 counting_ht = sys.argv[1] | |
| 24 infiles = sys.argv[2:] | |
| 25 | |
| 26 print 'file with ht: %s' % counting_ht | |
| 27 print '-- settings:' | |
| 28 print 'N THREADS', WORKER_THREADS | |
| 29 print '--' | |
| 30 | |
| 31 print 'making hashtable' | |
| 32 ht = khmer.load_counting_hash(counting_ht) | |
| 33 K = ht.ksize() | |
| 34 | |
| 35 for infile in infiles: | |
| 36 print 'filtering', infile | |
| 37 outfile = os.path.basename(infile) + '.below' | |
| 38 | |
| 39 outfp = open(outfile, 'w') | |
| 40 | |
| 41 def process_fn(record, ht=ht): | |
| 42 name = record['name'] | |
| 43 seq = record['sequence'] | |
| 44 if 'N' in seq: | |
| 45 return None, None | |
| 46 | |
| 47 trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF) | |
| 48 | |
| 49 if trim_at >= K: | |
| 50 return name, trim_seq | |
| 51 | |
| 52 return None, None | |
| 53 | |
| 54 tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE) | |
| 55 | |
| 56 tsp.start(verbose_fasta_iter(infile), outfp) | |
| 57 | |
| 58 if __name__ == '__main__': | |
| 59 main() |
