Repository 'khmer'
hg clone https://testtoolshed.g2.bx.psu.edu/repos/crusoe/khmer

Changeset 45:0b238b083f77 (2014-07-12)
Previous changeset 44:46d13bbb21f2 (2014-06-30) Next changeset 46:471f3e085664 (2014-07-12)
Commit message:
2 more tools
modified:
count-median.xml
filter-abund.xml
macros.xml
normalize-by-median.xml
added:
README.txt
do-partition.xml
filter-below-abund.py
filter-below-abund.xml
test-data/test-abund-read-2.fa.ct
b
diff -r 46d13bbb21f2 -r 0b238b083f77 README.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/README.txt Sat Jul 12 11:13:21 2014 -0400
b
@@ -0,0 +1,5 @@
+Requires galaxy-central dating from 2014-06-30 or newer
+
+https://bitbucket.org/galaxy/galaxy-central/commits/4de240e5a7819c768b8267c19e477530dab54370
+
+
b
diff -r 46d13bbb21f2 -r 0b238b083f77 count-median.xml
--- a/count-median.xml Mon Jun 30 16:51:11 2014 -0400
+++ b/count-median.xml Sat Jul 12 11:13:21 2014 -0400
b
@@ -37,7 +37,7 @@
  </stdio>
 
  <tests>
- <test>
+ <test interactor="api">
                         <param name="input_sequence_filename" value="test-abund-read-2.fa" />
                         <param name="input_counting_table_filename" value="test-abund-read-2.ct" ftype="ct" />
                         <output name="output_summary_filename">
b
diff -r 46d13bbb21f2 -r 0b238b083f77 do-partition.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/do-partition.xml Sat Jul 12 11:13:21 2014 -0400
[
@@ -0,0 +1,93 @@
+<tool id="gedlab-khmer-do-partition"
+ name="Sequence partition all-in-one"
+ version="1.1-1"
+ force_history_refresh="true">
+
+ <description>
+ Load, partition, and annotate FAST[AQ] sequences
+ </description>
+        <macros> <!-- @BINARY@ is substituted into the command template below -->
+                <token name="@BINARY@">do-partition.py</token>
+                <import>macros.xml</import>
+        </macros>
+        <expand macro="requirements" />
+ <!-- Runs the partitioner with graph base "output", then moves output.info to the information dataset and the .part files into output/ --> <command>
+mkdir -p output;
+@BINARY@
+@TABLEPARAMS@
+@THREADS@
+output
+#for input in $inputs
+$input
+#end for
+; mv output.info $information ;
+mv *.part output/
+ </command>
+
+ <inputs>
+ <expand macro="input_sequences_filenames" />
+ </inputs>
+ <outputs> <!-- "information" receives the output.info summary moved by the command -->
+ <data name="information"
+ format="txt"
+ label="${tool.name} summary for #echo ','.join(map(str, $inputs ))#" />
+ <expand macro="output_sequences" />
+ </outputs>
+  <stdio>
+        <!-- [HELP] If no exit code rule is defined, the tool will stop if anything is written to STDERR -->
+ <exit_code range="1:"
+ level="fatal" />
+ </stdio>
+
+<!-- <tests>
+ <test interactor="api">
+ <conditional name="parameters">
+ <param name="type" value="specific" />
+ <param name="inputs" value="test-abund-read-2.fa"/>
+ <param name="cutoff" value="1" />
+ <param name="ksize" value="17" />
+ </conditional>
+ <output name="output">
+ <discover_dataset name="test-abund-read-2.fa.keep">
+ <assert_contents>
+ <has_line_matching expression="GGTTGACGGGGCTCAGGGGG" />
+ </assert_contents>
+ </discover_dataset>
+ </output>
+ </test>
+ <test interactor="api">
+ <param name="inputs" value="test-abund-read-2.fa" />
+ <param name="cutoff" value="2" />
+ <param name="ksize" value="17" />
+ <output name="output">
+ <discover_dataset name="test-abund-read-2.fa.keep">
+ <assert_contents>
+ <has_line_matching expression="GGTTGACGGGGCTCAGGGGG" />
+ <has_line_matching expression="GGTTGACGGGGCTCAGGG" />
+ </assert_contents>
+ </discover_dataset>
+ </output>
+ </test>
+ <test interactor="api">
+ <param name="inputs" value="test-abund-read-paired.fa" />
+ <param name="cutoff" value="1" />
+ <param name="ksize" value="17" />
+ <param name="paired" value="true" />
+ <output name="output">
+ <discover_dataset name="test-abund-read-paired.fa.keep">
+ <assert_contents>
+ <has_line_matching expression="GGTTGACGGGGCTCAGGGGG" />
+ <has_line_matching expression="GGTTGACGGGGCTCAGGG" />
+ </assert_contents>
+ </discover_dataset>
+ </output>
+ </test>
+
+ </tests>
+    -->
+    <!-- [OPTIONAL] Help displayed in Galaxy -->
+    <!--
+ <help>
+ </help>
+    -->    
+</tool>
b
diff -r 46d13bbb21f2 -r 0b238b083f77 filter-abund.xml
--- a/filter-abund.xml Mon Jun 30 16:51:11 2014 -0400
+++ b/filter-abund.xml Sat Jul 12 11:13:21 2014 -0400
[
@@ -14,7 +14,7 @@
  <expand macro="requirements" />
  <command>
 mkdir output; cd output;
-filter-abund.py
+@BINARY@
 #if $cutoff != 2
   --cutoff=$cutoff
 #fi
@@ -43,7 +43,8 @@
  <expand macro="input_counting_table_filename" />
  </inputs>
  <outputs>
- <expand macro="output_sequences" />
+ <!-- <expand macro="output_sequences" /> -->
+ <expand macro="output_sequences_single" />
  </outputs>
   <stdio>
         <!-- [HELP] If no exit code rule is defined, the tool will stop if anything is written to STDERR -->
b
diff -r 46d13bbb21f2 -r 0b238b083f77 filter-below-abund.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/filter-below-abund.py Sat Jul 12 11:13:21 2014 -0400
[
@@ -0,0 +1,59 @@
+#! /usr/bin/env python2
+#
+# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# the three-clause BSD license; see doc/LICENSE.txt.
+# Contact: khmer-project@idyll.org
+#
+import sys
+import screed.fasta
+import os
+import khmer
+from khmer.thread_utils import ThreadedSequenceProcessor, verbose_fasta_iter
+
+WORKER_THREADS = 8
+GROUPSIZE = 100
+
+CUTOFF = 50
+
+###
+
+
+def main():
+    """Trim each file in argv[2:] against the counting table in argv[1]."""
+    counting_ht = sys.argv[1]
+    infiles = sys.argv[2:]
+
+    print 'file with ht: %s' % counting_ht
+    print '-- settings:'
+    print 'N THREADS', WORKER_THREADS
+    print '--'
+
+    print 'making hashtable'
+    ht = khmer.load_counting_hash(counting_ht)
+    K = ht.ksize()
+
+    for infile in infiles:
+        print 'filtering', infile
+        outfile = os.path.basename(infile) + '.below'  # relative: lands in cwd
+        # Per-read callback; (None, None) presumably makes the processor drop the read.
+        def process_fn(record, ht=ht):
+            name = record['name']
+            seq = record['sequence']
+            if 'N' in seq:
+                return None, None
+
+            trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF)
+            # Keep the read only if at least K bases remain after trimming.
+            if trim_at >= K:
+                return name, trim_seq
+
+            return None, None
+
+        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE)
+        # Close the file even on error; assumes start() returns after all writes - TODO confirm.
+        with open(outfile, 'w') as outfp:
+            tsp.start(verbose_fasta_iter(infile), outfp)
+
+if __name__ == '__main__':
+    main()
b
diff -r 46d13bbb21f2 -r 0b238b083f77 filter-below-abund.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/filter-below-abund.xml Sat Jul 12 11:13:21 2014 -0400
[
@@ -0,0 +1,64 @@
+<tool id="gedlab-khmer-filter-below-abund"
+ name="Filter below abundance cutoff of 50"
+ version="1.1-1"
+ force_history_refresh="true">
+
+ <description>
+ Trims fastq/fasta sequences at k-mers with abundance below 50
+ based on a provided k-mer counting table.
+ </description>
+ <macros>
+ <token name="@BINARY@">filter-below-abund.py</token>
+ <import>macros.xml</import>
+ </macros>
+ <expand macro="requirements" />
+ <!-- Runs inside ./output: the script gets the counting table first, then every input file --> <command>
+mkdir output; cd output;
+@BINARY@
+$countingtable_to_load
+#for input in $inputs
+ $input
+#end for
+ </command>
+
+ <inputs>
+ <expand macro="input_sequences_filenames" />
+ <expand macro="input_counting_table_filename" />
+ </inputs>
+ <outputs>
+ <!-- <expand macro="output_sequences" /> -->
+ <expand macro="output_sequences_single" />
+ </outputs>
+  <stdio>
+        <!-- [HELP] If no exit code rule is defined, the tool will stop if anything is written to STDERR -->
+ <exit_code range="1:"
+ level="fatal" />
+ </stdio>
+ <!--        <tests>
+                <test interactor="api">
+                        <param name="inputs" value="test-abund-read-2.fa" />
+                        <param name="input_counting_table_filename" value="test-abund-read-2.ct" ftype="ct" />
+                        <output name="output">
+ <discover_dataset name="test-abund-read-2.fa.abundfilt">
+ </discover_dataset>
+                        </output>
+                </test>
+                <test interactor="api">
+                        <param name="input_sequence_filename" value="test-abund-read-2.fa" />
+                        <param name="input_counting_table_filename" value="test-abund-read-2.ct" ftype="ct" />
+ <param name="cutoff" value="1" />
+                        <output name="output">
+ <discover_dataset name="test-abund-read-2.fa.abundfilt">
+                                 <assert_contents>
+                                         <has_text text="GGTTGACGGGGCTCAGGG" />
+                                 </assert_contents>
+ </discover_dataset>
+                        </output>
+                </test>
+ </tests> -->
+ <!-- [OPTIONAL] ReST Help displayed in Galaxy -->
+    <!--
+ <help>
+ </help>
+    -->    
+</tool>
b
diff -r 46d13bbb21f2 -r 0b238b083f77 macros.xml
--- a/macros.xml Mon Jun 30 16:51:11 2014 -0400
+++ b/macros.xml Sat Jul 12 11:13:21 2014 -0400
b
@@ -106,6 +106,11 @@
                         <discover_datasets pattern="__name__" directory="output" visible="true"/>
                 </data>
  </xml>
+ <xml name="output_sequences_single"> <!-- single output dataset; the name must be a valid template identifier -->
+                <data   name="output"
+                        format="input"
+                        label="${tool.name} processed nucleotide sequence file" />
+ </xml>
  <xml name="input_zero">
                 <param  name="zero"
                         type="boolean"
b
diff -r 46d13bbb21f2 -r 0b238b083f77 normalize-by-median.xml
--- a/normalize-by-median.xml Mon Jun 30 16:51:11 2014 -0400
+++ b/normalize-by-median.xml Sat Jul 12 11:13:21 2014 -0400
[
@@ -26,13 +26,18 @@
 --loadtable=$countingtable_to_load
 #end if
 --report-total-kmers
-#for input in $inputs
+#for entry in $many_inputs
+#for input in $entry.inputs
 $input
 #end for
+#end for
+--out=$output
  </command>
 
  <inputs>
- <expand macro="input_sequences_filenames" />
+ <repeat name="many_inputs" title="input(s) set" min="1" default="1">
+ <expand macro="input_sequences_filenames" />
+ </repeat>
  <param name="paired_switch"
  type="boolean"
  checked="false"
@@ -43,6 +48,7 @@
 
  <param name="countingtable_to_load"
  type="data"
+ format="ct"
  optional="true"
  label="an optional k-mer counting table to load"
  help="The inputs file(s) will be processed using the kmer counts in the specified k-mer counting table file as a starting point." />
@@ -60,11 +66,12 @@
  </inputs>
  <outputs>
  <data name="countingtable"
- format="data"
+ format="ct"
  label="${tool.name} k-mer counting table from  #echo ', '.join(map(str, $inputs ))#">
  <filter>save_countingtable == True</filter>
  </data>
- <expand macro="output_sequences" />
+ <!-- <expand macro="output_sequences" /> -->
+ <expand macro="output_sequences_single" />
  </outputs>
   <stdio>
         <!-- [HELP] If no exit code rule is defined, the tool will stop if anything is written to STDERR -->