changeset 45:0b238b083f77

2 more tools
author Michael R. Crusoe <mcrusoe@msu.edu>
date Sat, 12 Jul 2014 11:13:21 -0400
parents 46d13bbb21f2
children 471f3e085664
files README.txt count-median.xml do-partition.xml filter-abund.xml filter-below-abund.py filter-below-abund.xml macros.xml normalize-by-median.xml test-data/test-abund-read-2.fa.ct
diffstat 8 files changed, 241 insertions(+), 7 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.txt	Sat Jul 12 11:13:21 2014 -0400
@@ -0,0 +1,5 @@
+Requires galaxy-central dating from 2014-06-30 or newer
+
+https://bitbucket.org/galaxy/galaxy-central/commits/4de240e5a7819c768b8267c19e477530dab54370
+
+
--- a/count-median.xml	Mon Jun 30 16:51:11 2014 -0400
+++ b/count-median.xml	Sat Jul 12 11:13:21 2014 -0400
@@ -37,7 +37,7 @@
 	</stdio>
 
 	<tests>
-		<test>
+		<test interactor="api">
                         <param name="input_sequence_filename" value="test-abund-read-2.fa" />
                         <param name="input_counting_table_filename" value="test-abund-read-2.ct" ftype="ct" />
                         <output name="output_summary_filename">
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/do-partition.xml	Sat Jul 12 11:13:21 2014 -0400
@@ -0,0 +1,93 @@
+<tool	id="gedlab-khmer-do-partition"
+	name="Sequence partition all-in-one"
+	version="1.1-1"
+	force_history_refresh="true">
+	
+	<description>
+		Load, partition, and annotate FAST[AQ] sequences
+	</description>
+        <macros>
+                <token name="@BINARY@">do-partition.py</token> <!-- khmer script substituted for @BINARY@ in the command below -->
+                <import>macros.xml</import>
+        </macros>
+        <expand macro="requirements" />
+	<command>
+mkdir -p output;
+@BINARY@
+@TABLEPARAMS@
+@THREADS@
+output
+#for input in $inputs
+$input
+#end for ;
+mv output.info $information ;
+mv *.part output/
+	</command>
+
+	<inputs>
+		<expand macro="input_sequences_filenames" />
+	</inputs>
+	<outputs>
+		<data	name="information"
+			format="text"
+			label="${tool.name} summary for #echo ','.join(map(str, $inputs ))#" />
+		<expand macro="output_sequences" />
+	</outputs>
+ 	<stdio>
+        <!-- [HELP] If no exit code rule is defined, the tool will stop if anything is written to STDERR -->
+		<exit_code	range="1:"
+				level="fatal" />
+	</stdio>
+	
+<!--	<tests>
+		<test interactor="api">
+			<conditional name="parameters">
+				<param name="type" value="specific" />
+				<param name="inputs" value="test-abund-read-2.fa"/>
+				<param name="cutoff" value="1" />
+				<param name="ksize" value="17" />
+			</conditional>
+			<output name="output">
+				<discover_dataset name="test-abund-read-2.fa.keep">
+					<assert_contents>
+						<has_line_matching expression="GGTTGACGGGGCTCAGGGGG" />
+					</assert_contents>
+				</discover_dataset>
+			</output>
+		</test>
+		<test interactor="api">
+			<param name="inputs" value="test-abund-read-2.fa" />
+			<param name="cutoff" value="2" />
+			<param name="ksize" value="17" />
+			<output name="output">
+				<discover_dataset name="test-abund-read-2.fa.keep">
+					<assert_contents>
+						<has_line_matching expression="GGTTGACGGGGCTCAGGGGG" />
+						<has_line_matching expression="GGTTGACGGGGCTCAGGG" />
+					</assert_contents>
+				</discover_dataset>
+			</output>
+		</test>
+		<test interactor="api">
+			<param name="inputs" value="test-abund-read-paired.fa" />
+			<param name="cutoff" value="1" />
+			<param name="ksize" value="17" />
+			<param name="paired" value="true" />
+			<output name="output">
+				<discover_dataset name="test-abund-read-paired.fa.keep">
+					<assert_contents>
+						<has_line_matching expression="GGTTGACGGGGCTCAGGGGG" />
+						<has_line_matching expression="GGTTGACGGGGCTCAGGG" />
+					</assert_contents>
+				</discover_dataset>
+			</output>
+		</test>
+
+	</tests>
+    -->
+    <!-- [OPTIONAL] Help displayed in Galaxy -->
+    <!--
+	<help>
+	</help>
+    -->    
+</tool>
--- a/filter-abund.xml	Mon Jun 30 16:51:11 2014 -0400
+++ b/filter-abund.xml	Sat Jul 12 11:13:21 2014 -0400
@@ -14,7 +14,7 @@
 	<expand macro="requirements" />
 	<command>
 mkdir output; cd output;
-filter-abund.py
+@BINARY@
 #if $cutoff != 2
   --cutoff=$cutoff
 #fi
@@ -43,7 +43,8 @@
 		<expand macro="input_counting_table_filename" />	
 	</inputs>
 	<outputs>
-		<expand macro="output_sequences" />
+		<!-- <expand macro="output_sequences" /> -->
+		<expand macro="output_sequences_single" />
 	</outputs>
  	<stdio>
         <!-- [HELP] If no exit code rule is defined, the tool will stop if anything is written to STDERR -->
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter-below-abund.py	Sat Jul 12 11:13:21 2014 -0400
@@ -0,0 +1,59 @@
+#! /usr/bin/env python2
+#
+# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# the three-clause BSD license; see doc/LICENSE.txt.
+# Contact: khmer-project@idyll.org
+#
+import sys
+import screed.fasta
+import os
+import khmer
+from khmer.thread_utils import ThreadedSequenceProcessor, verbose_fasta_iter
+
+WORKER_THREADS = 8
+GROUPSIZE = 100
+
+CUTOFF = 50
+
+###
+
+
+def main():
+    counting_ht = sys.argv[1]
+    infiles = sys.argv[2:]
+
+    print 'file with ht: %s' % counting_ht
+    print '-- settings:'
+    print 'N THREADS', WORKER_THREADS
+    print '--'
+
+    print 'making hashtable'
+    ht = khmer.load_counting_hash(counting_ht)
+    K = ht.ksize()
+
+    for infile in infiles:
+        print 'filtering', infile
+        outfile = os.path.basename(infile) + '.below'
+
+        outfp = open(outfile, 'w')
+
+        def process_fn(record, ht=ht):
+            name = record['name']
+            seq = record['sequence']
+            if 'N' in seq:
+                return None, None
+
+            trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF)
+
+            if trim_at >= K:
+                return name, trim_seq
+
+            return None, None
+
+        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE)
+
+        tsp.start(verbose_fasta_iter(infile), outfp)
+
+if __name__ == '__main__':
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter-below-abund.xml	Sat Jul 12 11:13:21 2014 -0400
@@ -0,0 +1,64 @@
+<tool	id="gedlab-khmer-filter-below-abund"
+	name="Filter below abundance cutoff of 50"
+	version="1.1-1"
+	force_history_refresh="true">
+	
+	<description>
+		Trims fastq/fasta sequences at k-mers with abundance below 50
+		based on a provided k-mer counting table.
+	</description>
+	<macros>
+		<token name="@BINARY@">filter-below-abund.py</token>
+		<import>macros.xml</import>
+	</macros>
+	<expand macro="requirements" />
+	<command>
+mkdir output; cd output;
+@BINARY@
+$countingtable_to_load
+#for input in $inputs
+ $input
+#end for
+	</command>
+
+	<inputs>
+		<expand macro="input_sequences_filenames" />
+		<expand macro="input_counting_table_filename" />	
+	</inputs>
+	<outputs>
+		<!-- <expand macro="output_sequences" /> -->
+		<expand macro="output_sequences_single" />
+	</outputs>
+ 	<stdio>
+        <!-- [HELP] If no exit code rule is defined, the tool will stop if anything is written to STDERR -->
+		<exit_code	range="1:"
+				level="fatal" />
+	</stdio>
+	<!--        <tests>
+                <test interactor="api">
+                        <param name="inputs" value="test-abund-read-2.fa" />
+                        <param name="input_counting_table_filename" value="test-abund-read-2.ct" ftype="ct" />
+                        <output name="output">
+				<discover_dataset name="test-abund-read-2.fa.abundfilt">
+				</discover_dataset>
+                        </output>
+                </test>
+                <test interactor="api">
+                        <param name="input_sequence_filename" value="test-abund-read-2.fa" />
+                        <param name="input_counting_table_filename" value="test-abund-read-2.ct" ftype="ct" />
+			<param name="cutoff" value="1" />
+                        <output name="output">
+				<discover_dataset name="test-abund-read-2.fa.abundfilt">
+                                	<assert_contents>
+                                        	<has_text text="GGTTGACGGGGCTCAGGG" />
+                                	</assert_contents>
+				</discover_dataset>
+                        </output>
+                </test>
+	</tests> -->
+	<!-- [OPTIONAL] ReST Help displayed in Galaxy -->
+    <!--
+	<help>
+	</help>
+    -->    
+</tool>
--- a/macros.xml	Mon Jun 30 16:51:11 2014 -0400
+++ b/macros.xml	Sat Jul 12 11:13:21 2014 -0400
@@ -106,6 +106,11 @@
                         <discover_datasets pattern="__name__" directory="output" visible="true"/>
                 </data>
 	</xml>
+	<xml name="output_sequences_single">
+                <data   name="output"
+                        format="input"
+                        label="${tool.name} processed nucleotide sequence file" /> <!-- one dataset, referenced as $output in tool commands -->
+	</xml>
 	<xml name="input_zero">
                 <param  name="zero"
                         type="boolean"
--- a/normalize-by-median.xml	Mon Jun 30 16:51:11 2014 -0400
+++ b/normalize-by-median.xml	Sat Jul 12 11:13:21 2014 -0400
@@ -26,13 +26,18 @@
 --loadtable=$countingtable_to_load
 #end if
 --report-total-kmers
-#for input in $inputs
+#for entry in $many_inputs
+#for input in $entry.inputs
 $input
 #end for
+#end for
+--out=$output
 	</command>
 
 	<inputs>
-		<expand macro="input_sequences_filenames" />
+		<repeat name="many_inputs" title="input(s) set" min="1" default="1">
+			<expand macro="input_sequences_filenames" />
+		</repeat>
 		<param	name="paired_switch"
 			type="boolean"
 			checked="false"
@@ -43,6 +48,7 @@
 
 		<param	name="countingtable_to_load"
 			type="data"
+			format="ct"
 			optional="true"
 			label="an optional k-mer counting table to load"
 			help="The inputs file(s) will be processed using the kmer counts in the specified k-mer counting table file as a starting point." />
@@ -60,11 +66,12 @@
 	</inputs>
 	<outputs>
 		<data	name="countingtable"
-			format="data"
+			format="ct"
 			label="${tool.name} k-mer counting table from  #echo ', '.join(map(str, $inputs ))#">
 			<filter>save_countingtable == True</filter>
 		</data>
-		<expand macro="output_sequences" />
+		<!-- <expand macro="output_sequences" /> -->
+		<expand macro="output_sequences_single" />
 	</outputs>
  	<stdio>
         <!-- [HELP] If no exit code rule is defined, the tool will stop if anything is written to STDERR -->