Mercurial > repos > crusoe > khmer
changeset 45:0b238b083f77
2 more tools
author | Michael R. Crusoe <mcrusoe@msu.edu> |
---|---|
date | Sat, 12 Jul 2014 11:13:21 -0400 |
parents | 46d13bbb21f2 |
children | 471f3e085664 |
files | README.txt count-median.xml do-partition.xml filter-abund.xml filter-below-abund.py filter-below-abund.xml macros.xml normalize-by-median.xml test-data/test-abund-read-2.fa.ct |
diffstat | 8 files changed, 241 insertions(+), 7 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.txt Sat Jul 12 11:13:21 2014 -0400 @@ -0,0 +1,5 @@ +Requires galaxy-central dating from 2014-06-30 or newer + +https://bitbucket.org/galaxy/galaxy-central/commits/4de240e5a7819c768b8267c19e477530dab54370 + +
--- a/count-median.xml Mon Jun 30 16:51:11 2014 -0400 +++ b/count-median.xml Sat Jul 12 11:13:21 2014 -0400 @@ -37,7 +37,7 @@ </stdio> <tests> - <test> + <test interactor="api"> <param name="input_sequence_filename" value="test-abund-read-2.fa" /> <param name="input_counting_table_filename" value="test-abund-read-2.ct" ftype="ct" /> <output name="output_summary_filename">
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/do-partition.xml Sat Jul 12 11:13:21 2014 -0400 @@ -0,0 +1,93 @@ +<tool id="gedlab-khmer-do-partition" + name="Sequence partition all-in-one" + version="1.1-1" + force_history_refresh="true"> + + <description> + Load, partition, and annotate FAST[AQ] sequences + </description> + <macros> + <token name="@BINARY@">do-partition.py</token> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <command> +mkdir -p output; +@BINARY@ +@TABLEPARAMS@ +@THREADS@ +output +#for input in $inputs +$input +#end for ; +mv output.info $information ; +mv *.part output/ + </command> + + <inputs> + <expand macro="input_sequences_filenames" /> + </inputs> + <outputs> + <data name="information" + format="text" + label="${tool.name} summary for #echo ','.join(map(str, $inputs ))#" /> + <expand macro="output_sequences" /> + </outputs> + <stdio> + <!-- [HELP] If no exit code rule is defined, the tool will stop if anything is written to STDERR --> + <exit_code range="1:" + level="fatal" /> + </stdio> + +<!-- <tests> + <test interactor="api"> + <conditional name="parameters"> + <param name="type" value="specific" /> + <param name="inputs" value="test-abund-read-2.fa"/> + <param name="cutoff" value="1" /> + <param name="ksize" value="17" /> + </conditional> + <output name="output"> + <discover_dataset name="test-abund-read-2.fa.keep"> + <assert_contents> + <has_line_matching expression="GGTTGACGGGGCTCAGGGGG" /> + </assert_contents> + </discover_dataset> + </output> + </test> + <test interactor="api"> + <param name="inputs" value="test-abund-read-2.fa" /> + <param name="cutoff" value="2" /> + <param name="ksize" value="17" /> + <output name="output"> + <discover_dataset name="test-abund-read-2.fa.keep"> + <assert_contents> + <has_line_matching expression="GGTTGACGGGGCTCAGGGGG" /> + <has_line_matching expression="GGTTGACGGGGCTCAGGG" /> + </assert_contents> + </discover_dataset> + </output> + </test> + <test 
interactor="api"> + <param name="inputs" value="test-abund-read-paired.fa" /> + <param name="cutoff" value="1" /> + <param name="ksize" value="17" /> + <param name="paired" value="true" /> + <output name="output"> + <discover_dataset name="test-abund-read-paired.fa.keep"> + <assert_contents> + <has_line_matching expression="GGTTGACGGGGCTCAGGGGG" /> + <has_line_matching expression="GGTTGACGGGGCTCAGGG" /> + </assert_contents> + </discover_dataset> + </output> + </test> + + </tests> + --> + <!-- [OPTIONAL] Help displayed in Galaxy --> + <!-- + <help> + </help> + --> +</tool>
--- a/filter-abund.xml Mon Jun 30 16:51:11 2014 -0400 +++ b/filter-abund.xml Sat Jul 12 11:13:21 2014 -0400 @@ -14,7 +14,7 @@ <expand macro="requirements" /> <command> mkdir output; cd output; -filter-abund.py +@BINARY@ #if $cutoff != 2 --cutoff=$cutoff #fi @@ -43,7 +43,8 @@ <expand macro="input_counting_table_filename" /> </inputs> <outputs> - <expand macro="output_sequences" /> + <!-- <expand macro="output_sequences" /> --> + <expand macro="output_sequences_single" /> </outputs> <stdio> <!-- [HELP] If no exit code rule is defined, the tool will stop if anything is written to STDERR -->
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter-below-abund.py Sat Jul 12 11:13:21 2014 -0400 @@ -0,0 +1,59 @@ +#! /usr/bin/env python2 +# +# This file is part of khmer, http://github.com/ged-lab/khmer/, and is +# Copyright (C) Michigan State University, 2009-2013. It is licensed under +# the three-clause BSD license; see doc/LICENSE.txt. +# Contact: khmer-project@idyll.org +# +import sys +import screed.fasta +import os +import khmer +from khmer.thread_utils import ThreadedSequenceProcessor, verbose_fasta_iter + +WORKER_THREADS = 8 +GROUPSIZE = 100 + +CUTOFF = 50 + +### + + +def main(): + counting_ht = sys.argv[1] + infiles = sys.argv[2:] + + print 'file with ht: %s' % counting_ht + print '-- settings:' + print 'N THREADS', WORKER_THREADS + print '--' + + print 'making hashtable' + ht = khmer.load_counting_hash(counting_ht) + K = ht.ksize() + + for infile in infiles: + print 'filtering', infile + outfile = os.path.basename(infile) + '.below' + + outfp = open(outfile, 'w') + + def process_fn(record, ht=ht): + name = record['name'] + seq = record['sequence'] + if 'N' in seq: + return None, None + + trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF) + + if trim_at >= K: + return name, trim_seq + + return None, None + + tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE) + + tsp.start(verbose_fasta_iter(infile), outfp) + +if __name__ == '__main__': + main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter-below-abund.xml Sat Jul 12 11:13:21 2014 -0400 @@ -0,0 +1,64 @@ +<tool id="gedlab-khmer-filter-below-abund" + name="Filter below abundance cutoff of 50" + version="1.1-1" + force_history_refresh="true"> + + <description> + Trims fastq/fasta sequences at k-mers with abundance below 50 + based on a provided k-mer counting table. + </description> + <macros> + <token name="@BINARY@">filter-below-abund.py</token> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <command> +mkdir output; cd output; +@BINARY@ +$countingtable_to_load +#for input in $inputs + $input +#end for + </command> + + <inputs> + <expand macro="input_sequences_filenames" /> + <expand macro="input_counting_table_filename" /> + </inputs> + <outputs> + <!-- <expand macro="output_sequences" /> --> + <expand macro="output_sequences_single" /> + </outputs> + <stdio> + <!-- [HELP] If no exit code rule is defined, the tool will stop if anything is written to STDERR --> + <exit_code range="1:" + level="fatal" /> + </stdio> + <!-- <tests> + <test interactor="api"> + <param name="inputs" value="test-abund-read-2.fa" /> + <param name="input_counting_table_filename" value="test-abund-read-2.ct" ftype="ct" /> + <output name="output"> + <discover_dataset name="test-abund-read-2.fa.abundfilt"> + </discover_dataset> + </output> + </test> + <test interactor="api"> + <param name="input_sequence_filename" value="test-abund-read-2.fa" /> + <param name="input_counting_table_filename" value="test-abund-read-2.ct" ftype="ct" /> + <param name="cutoff" value="1" /> + <output name="output"> + <discover_dataset name="test-abund-read-2.fa.abundfilt"> + <assert_contents> + <has_text text="GGTTGACGGGGCTCAGGG" /> + </assert_contents> + </discover_dataset> + </output> + </test> + </tests> --> + <!-- [OPTIONAL] ReST Help displayed in Galaxy --> + <!-- + <help> + </help> + --> +</tool>
--- a/macros.xml Mon Jun 30 16:51:11 2014 -0400 +++ b/macros.xml Sat Jul 12 11:13:21 2014 -0400 @@ -106,6 +106,11 @@ <discover_datasets pattern="__name__" directory="output" visible="true"/> </data> </xml> + <xml name="output_sequences_single"> + <data name="output sequences" + format="input" + label="${tool.name} processed nucleotide sequence file" /> + </xml> <xml name="input_zero"> <param name="zero" type="boolean"
--- a/normalize-by-median.xml Mon Jun 30 16:51:11 2014 -0400 +++ b/normalize-by-median.xml Sat Jul 12 11:13:21 2014 -0400 @@ -26,13 +26,18 @@ --loadtable=$countingtable_to_load #end if --report-total-kmers -#for input in $inputs +#for entry in $many_inputs +#for input in $entry.inputs $input #end for +#end for +--out=$output </command> <inputs> - <expand macro="input_sequences_filenames" /> + <repeat name="many_inputs" title="input(s) set" min="1" default="1"> + <expand macro="input_sequences_filenames" /> + </repeat> <param name="paired_switch" type="boolean" checked="false" @@ -43,6 +48,7 @@ <param name="countingtable_to_load" type="data" + format="ct" optional="true" label="an optional k-mer counting table to load" help="The inputs file(s) will be processed using the kmer counts in the specified k-mer counting table file as a starting point." /> @@ -60,11 +66,12 @@ </inputs> <outputs> <data name="countingtable" - format="data" + format="ct" label="${tool.name} k-mer counting table from #echo ', '.join(map(str, $inputs ))#"> <filter>save_countingtable == True</filter> </data> - <expand macro="output_sequences" /> + <!-- <expand macro="output_sequences" /> --> + <expand macro="output_sequences_single" /> </outputs> <stdio> <!-- [HELP] If no exit code rule is defined, the tool will stop if anything is written to STDERR -->