changeset 2:5eb99d21ef0d

Add trinityrnaseq_norm and transcriptsToOrfs tools
author Jim Johnson <jj@umn.edu>
date Thu, 05 Sep 2013 08:08:21 -0500
parents a34ce2b18877
children a9d882069cd4
files tool-data/pfam_db.loc.sample tool_data_table_conf.xml.sample tool_dependencies.xml transcriptsToOrfs.xml trinityrnaseq.xml trinityrnaseq_norm.xml
diffstat 6 files changed, 309 insertions(+), 1 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/pfam_db.loc.sample	Thu Sep 05 08:08:21 2013 -0500
@@ -0,0 +1,3 @@
+#release_name	release	name	path
+#Pfam27.0_Pfam-A	Pfam27.0	Pfam-A	/data/pfam/Pfam-A.hmm
+#Pfam27.0_Pfam-B	Pfam27.0	Pfam-B	/data/pfam/Pfam-B.hmm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Thu Sep 05 08:08:21 2013 -0500
@@ -0,0 +1,7 @@
+<tables>
+    <!-- Pfam HMM database locations offered by the transcriptsToOrfs tool; rows are read from tool-data/pfam_db.loc -->
+    <table name="pfam_databases" comment_char="#">
+        <columns>value, release, name, path</columns>
+        <file path="tool-data/pfam_db.loc" />
+    </table>
+</tables>
--- a/tool_dependencies.xml	Fri Aug 30 10:56:26 2013 -0500
+++ b/tool_dependencies.xml	Thu Sep 05 08:08:21 2013 -0500
@@ -9,5 +9,8 @@
     <package name="bowtie" version="1.0.0">
         <repository changeset_revision="e682af6a72cd" name="package_bowtie_1_0_0" owner="jjohnson" toolshed="http://testtoolshed.g2.bx.psu.edu" />
     </package>
+    <package name="hmmer" version="3.0">
+        <repository changeset_revision="3bc37773c609" name="package_hmmer_3_0" owner="jjohnson" toolshed="http://testtoolshed.g2.bx.psu.edu" />
+    </package>
 </tool_dependency>
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/transcriptsToOrfs.xml	Thu Sep 05 08:08:21 2013 -0500
@@ -0,0 +1,117 @@
+<tool id="transcriptsToOrfs" name="transcriptsToOrfs" version="0.0.2">
+    <description>Trinity Transcripts to Candidate Peptides</description>
+    <requirements>
+        <requirement type="package" version="2013_08_14">trinityrnaseq</requirement>
+        <requirement type="package" version="3.0">hmmer</requirement>
+    </requirements>
+    <command>
+        ## Run the TransDecoder ORF finder on the transcripts; optional flags are emitted only when the user set a value.
+        \$TRINITY_HOME/trinity-plugins/transdecoder/transcripts_to_best_scoring_ORFs.pl -t $transcripts
+        #if $min_prot_length:
+            -m $min_prot_length
+        #end if
+        #if $retain_long_orfs:
+            --retain_long_orfs $retain_long_orfs
+        #end if
+        #if $training_count:
+            -T $training_count
+        #end if
+        #if str($strand_specificity) == 'SS':
+            -S
+        #end if
+        #if str($genetic_code) != '':
+            -G $genetic_code
+        #end if
+        #if str($search.use_pfam) == 'yes':
+          --search_pfam "${search.pfam_db.fields.path}"
+          --CPU $search.CPU
+        #end if
+    </command>
+    <inputs> <!-- User-facing parameters; the command section turns each one into a command-line flag. -->
+        <param format="fasta" name="transcripts" type="data" label="Transcripts sequences in fastA format" help="" />
+        <param name="min_prot_length"  type="integer" value="" optional="true" label="Minimum peptide length (in amino acids)" help="default: 100">
+            <validator type="in_range" message="Minimum peptide length should be at least 50" min="50" />
+        </param>
+        <param name="retain_long_orfs"  type="integer" value="" optional="true" label="Retain all ORFs found that are of minimum length in nucleotides" help="default: 900" >
+            <validator type="in_range" message="ORF length should be at least 50" min="50" />
+        </param>
+        <param name="training_count"  type="integer" value="" optional="true" label="Number of top longest ORFs to train Markov Model (hexamer stats)" help="default: 500" >
+            <validator type="in_range" message="ORF count should be at least 50" min="50" />
+        </param>
+        <param name="strand_specificity" type="select" label="Strand specificity type"> <!-- "SS" adds -S to the command line; "DS" adds no flag -->
+            <option value="DS">NOT strand specific, examine both strands</option>
+            <option value="SS">Strand specific, examine only top strand</option>
+        </param>
+        <param name="genetic_code" type="select" label="Genetic Code"> <!-- the empty value means no -G flag is passed -->
+            <option value="">use default(universal)</option>
+            <option value="universal">universal</option>
+            <option value="Euplotes">Euplotes</option>
+            <option value="Tetrahymena">Tetrahymena</option>
+            <option value="Candida">Candida</option>
+            <option value="Acetabularia">Acetabularia</option>
+        </param>
+        <conditional name="search"> <!-- the Pfam database and CPU parameters exist only in the "yes" case -->
+            <param name="use_pfam" type="select" label="Search PFAM database">
+                <option value="no">NO</option>
+                <option value="yes">YES</option>
+            </param>
+            <when value="no"/>
+            <when value="yes">
+                <param name="pfam_db" type="select" label="Pfam database">
+                    <options from_data_table="pfam_databases" />
+                </param>
+                <param name="CPU" type="integer" value="2" min="1" label="CPU" help="Number of CPUs to use by hmmscan" />
+            </when>
+        </conditional>
+    </inputs>
+    <stdio> <!-- The job is marked failed on an exit code of 1 or higher, or when "Error" appears on stderr. -->
+        <exit_code range="1:"  level="fatal" description="Failed" />
+        <regex match="Error" 
+               source="stderr" 
+               level="fatal" 
+               description="Failed" />
+    </stdio>
+    <outputs> <!-- Each dataset is collected from the job working directory via from_work_dir; the Pfam table is kept only when the Pfam search is enabled. -->
+        <data format="txt" name="trinity_pep_pfam" label="${tool.name} on ${on_string}: Pfam matches to Candidate Peptide Sequences" from_work_dir="longest_orfs.pep.pfam.dat">
+          <filter>search['use_pfam'] == 'yes'</filter>
+        </data>
+        <data format="gff3" name="trinity_pep_gff3" label="${tool.name} on ${on_string}: Candidate Peptide Features" from_work_dir="best_candidates.eclipsed_orfs_removed.gff3" />
+        <data format="bed" name="trinity_pep_bed" label="${tool.name} on ${on_string}: Candidate Peptide Coordinates" from_work_dir="best_candidates.eclipsed_orfs_removed.bed" />
+        <data format="fasta" name="trinity_pep_cds" label="${tool.name} on ${on_string}: Candidate Peptide CDS Sequences" from_work_dir="best_candidates.eclipsed_orfs_removed.cds"/>
+        <data format="fasta" name="trinity_pep_seqs" label="${tool.name} on ${on_string}: Candidate Peptide Sequences" from_work_dir="best_candidates.eclipsed_orfs_removed.pep"/>
+    </outputs>
+    <tests>
+        <test> <!-- Pfam search disabled; checks each of the four ORF outputs for a known substring. -->
+            <param name="transcripts" ftype="fasta" value="TrinitySingle.fasta"/>
+            <param name="min_prot_length" value="100"/>
+            <param name="use_pfam" value="no"/>
+            <output name="trinity_pep_seqs">
+                <assert_contents>
+                    <has_text text="WAAKAWLITARSLYPADF" />
+                </assert_contents>
+            </output>
+            <output name="trinity_pep_cds">
+                <assert_contents>
+                    <has_text text="TGGGCAGCCAAGGCATGGCTGATCACGGCCCGCA" />
+                </assert_contents>
+            </output>
+            <output name="trinity_pep_bed">
+                <assert_contents>
+                    <has_text text="comp10_c0_seq1" />
+                </assert_contents>
+            </output>
+            <output name="trinity_pep_gff3">
+                <assert_contents>
+                    <has_text text="comp10_c0_seq1" />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help>
+        **transcriptsToOrfs**
+        Trinity_ is a de novo transcript assembler that uses RNA-seq data as input. 
+        This tool searches for open reading frames in the assembled transcripts.
+        
+        .. _Trinity: http://trinityrnaseq.sourceforge.net
+    </help>
+</tool>
--- a/trinityrnaseq.xml	Fri Aug 30 10:56:26 2013 -0500
+++ b/trinityrnaseq.xml	Thu Sep 05 08:08:21 2013 -0500
@@ -110,7 +110,7 @@
         </conditional>
     </inputs>
     <stdio>
-        <exit_code range="1:"  level="fatal"   description="Faiiled" />
+        <exit_code range="1:"  level="fatal" description="Failed" />
         <regex match="command not found" 
                source="both" 
                level="fatal" 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/trinityrnaseq_norm.xml	Thu Sep 05 08:08:21 2013 -0500
@@ -0,0 +1,178 @@
+<tool id="trinityrnaseq_norm" name="Trinity read normalization" version="0.0.2">
+    <!-- Written by Jeremy Goecks, modified by Josh Bowden for normalization procedure, now maintained here by bhaas -->
+    <description>Pre-process RNA-seq data to reduce coverage of highly covered areas</description>
+    <requirements>
+        <requirement type="package" version="2013_08_14">trinityrnaseq</requirement>
+    </requirements>
+    <command>
+        ## symlink the selected inputs into the working directory under fixed names
+        #if str($inputs.paired_or_single) == "paired":
+          ln -s $inputs.left_input left_reads &amp;&amp;
+          ln -s $inputs.right_input right_reads &amp;&amp;
+        #else:
+          ln -s $inputs.input single_reads &amp;&amp;
+        #end if
+        \${TRINITY_HOME}/util/normalize_by_kmer_coverage.pl --JM $JM --max_cov $max_cov
+        ## Inputs. Galaxy reports FASTA datasets with the extension "fasta"; any other extension is passed as fq.
+        #if str($inputs.paired_or_single) == "paired":
+            --left left_reads --right right_reads
+            #if str($inputs.left_input.ext) == 'fasta':
+                --seqType fa
+            #else:
+                --seqType fq
+            #end if
+            $inputs.pe_reads_unordered
+            #if str($inputs.library_type) != "None":
+                --SS_lib_type $inputs.library_type
+            #end if
+            $inputs.pairs_together
+            $inputs.parallel_stats
+        #else:
+            --single single_reads
+            #if str($inputs.input.ext) == 'fasta':
+                --seqType fa
+            #else:
+                --seqType fq
+            #end if
+            #if str($inputs.library_type) != "None":
+                --SS_lib_type $inputs.library_type
+            #end if
+        #end if
+        #if $kmer_size:
+            --KMER_SIZE $kmer_size
+        #end if
+        #if $max_pct_stdev:
+            --max_pct_stdev $max_pct_stdev
+        #end if
+        ## copy stdout into the log dataset. NOTE(review): in a plain sh pipeline the exit status is that of tee, so a failing script may only be caught by the stdout regex in stdio; confirm
+        | tee  $trinity_coverage_normalization_log &amp;&amp;
+        #if str($inputs.paired_or_single) == "paired":
+          cp left_reads.normalized* $output_left &amp;&amp;
+          cp right_reads.normalized* $output_right
+        #else:
+          cp single_reads.normalized* $output_single
+        #end if
+    </command>
+    <inputs>
+        <param name="JM" type="select" label="JM" help="Amount of memory to allocate to Jellyfish for Kmer catalog construction">
+            <option value="1G">1G</option>
+            <option value="10G">10G</option>
+            <option value="20G">20G</option>
+            <option value="50G">50G</option>
+            <option value="100G">100G</option>
+        </param>
+        <!-- JM and max_cov are always passed to the script as the JM and max_cov options -->
+        <param name="max_cov" type="select" label="max_cov" help="Maximum read coverage to keep">
+            <option value="30">30</option>
+            <option value="40">40</option>
+            <option value="50">50</option>
+            <option value="60">60</option>
+            <option value="70">70</option>
+            <option value="100">100</option>
+        </param>
+        <!-- Read layout: the paired case takes left and right datasets plus pairing flags, the single case takes one dataset -->
+        <conditional name="inputs">
+            <param name="paired_or_single" type="select" label="Paired or Single-end data?">
+                <option value="paired">Paired</option>
+                <option value="single">Single</option>
+            </param>
+            <when value="paired">
+                <param format="fasta,fastq" name="left_input" type="data" label="Left/Forward strand reads" help=""/>
+                <param format="fasta,fastq" name="right_input" type="data" label="Right/Reverse strand reads" help=""/>
+                <param name="library_type" type="select" label="Strand-specific Library Type">
+                    <option value="None">None</option>
+                    <option value="FR">FR</option>
+                    <option value="RF">RF</option>
+                </param>
+                <param name="pe_reads_unordered" type="boolean" truevalue="--PE_reads_unordered" falsevalue="" checked="false" label="set if the input paired-end reads are not identically ordered"/>
+                <param name="pairs_together" type="boolean" truevalue="--pairs_together" falsevalue="" checked="false" label="process paired reads by averaging stats between pairs and retaining linking info"/>
+                <param name="parallel_stats" type="boolean" truevalue="--PARALLEL_STATS" falsevalue="" checked="false" label="generate read stats in parallel for paired reads" help="(Figure 2X Inchworm memory requirement)"/>
+            </when>
+            <when value="single">
+                <param format="fasta,fastq" name="input" type="data" label="Single-end reads" help=""/>
+                <param name="library_type" type="select" label="Strand-specific Library Type">
+                    <option value="None">None</option>
+                    <option value="F">F</option>
+                    <option value="R">R</option>
+                </param>
+            </when>
+        </conditional>
+        <param name="kmer_size"  type="integer" value="" optional="true" label="KMER SIZE" help="default: 25">
+            <validator type="in_range" message="kmer size between 3 and 200" min="3" max="200"/>
+        </param>
+        <param name="max_pct_stdev"  type="integer" value="" optional="true" label="maximum pct of mean for stdev of kmer coverage across read" help="default: 100">
+            <validator type="in_range" message="max_pct_stdev should be between 10 and 100" min="10" max="100"/>
+        </param>
+
+    </inputs>
+    <stdio> <!-- The job is marked failed on an exit code of 1 or higher, or when "Error" appears on stdout. -->
+        <exit_code range="1:"  level="fatal" description="Failed" />
+        <regex match="Error" 
+               source="stdout" 
+               level="fatal" 
+               description="Failed" />
+    </stdio>
+    <outputs>
+	<!-- The log dataset has no filter; each read output carries a filter element on the paired_or_single selection, so only the datasets for the chosen mode are kept. -->
+	<data format="txt" name="trinity_coverage_normalization_log" label="${tool.name} on ${on_string}: log" />
+	<data format_source="left_input" name="output_left" label="${tool.name} on ${on_string}: Normalized left reads"> 
+            <filter>inputs['paired_or_single'] == "paired"</filter>
+        </data>
+	<data format_source="right_input" name="output_right" label="${tool.name} on ${on_string}: Normalized right reads"> 
+            <filter>inputs['paired_or_single'] == "paired"</filter>
+        </data>
+	<data format_source="input" name="output_single" label="${tool.name} on ${on_string}: Normalized reads"> 
+            <filter>inputs['paired_or_single'] == "single"</filter>
+        </data>
+    </outputs>
+    <tests>
+        <test> <!-- Single-end mode: checks the log for the completion message and the normalized reads for a known sequence. -->
+            <param name="JM" value="1G"/>
+            <param name="max_cov" value="30"/>
+            <param name="paired_or_single" value="single"/>
+            <param name="input" ftype="fastq" value="reads.left.fq"/>
+            <param name="library_type" value="None"/>
+            <output name="trinity_coverage_normalization_log">
+                <assert_contents>
+                    <has_text text="Normalization complete." />
+                </assert_contents>
+            </output>
+            <output name="output_single">
+                <assert_contents>
+                    <has_text text="ACTGCATCCTGGAAAGAATCAATGGTGGCCGGAAAGTGTTTTTCAAATACAAGAGTGACAATGTGCCCTGTTGTTT" />
+                </assert_contents>
+            </output>
+        </test>
+        <test> <!-- Paired-end mode with all pairing flags off: checks the log and both normalized read files. -->
+            <param name="JM" value="1G"/>
+            <param name="max_cov" value="30"/>
+            <param name="paired_or_single" value="paired"/>
+            <param name="left_input" ftype="fastq" value="reads.left.fq"/>
+            <param name="right_input" ftype="fastq" value="reads.right.fq"/>
+            <param name="library_type" value="None"/>
+            <param name="pe_reads_unordered" value="False"/>
+            <param name="pairs_together" value="False"/>
+            <param name="parallel_stats" value="False"/>
+            <output name="trinity_coverage_normalization_log">
+                <assert_contents>
+                    <has_text text="Normalization complete." />
+                </assert_contents>
+            </output>
+            <output name="output_left">
+                <assert_contents>
+                    <has_text text="CTGGGCTGCAGCTAAGTTCTCTGCATCCTCCTTCTTGCTTGTGGCTGGGAAGAAGACAATGTTGTCGATGGTCTGG" />
+                </assert_contents>
+            </output>
+            <output name="output_right">
+                <assert_contents>
+                    <has_text text="CTCAAATGGTTAATTCTCAGGCTGCAAATATTCGTTCAGGATGGAAGAACATTTTCTCAGTATTCCATCTAGCTGC" />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help>
+        Runs the Trinity_ script util/normalize_by_kmer_coverage.pl, which reduces data sizes with minimal impact on recovered transcripts when used by Trinity.pl.
+        
+        .. _Trinity: http://trinityrnaseq.sourceforge.net
+    </help>
+</tool>