Mercurial > repos > jjohnson > trinityrnaseq
changeset 2:5eb99d21ef0d
Add trinityrnaseq_norm and transcriptsToOrfs tools
author | Jim Johnson <jj@umn.edu> |
---|---|
date | Thu, 05 Sep 2013 08:08:21 -0500 |
parents | a34ce2b18877 |
children | a9d882069cd4 |
files | tool-data/pfam_db.loc.sample tool_data_table_conf.xml.sample tool_dependencies.xml transcriptsToOrfs.xml trinityrnaseq.xml trinityrnaseq_norm.xml |
diffstat | 6 files changed, 309 insertions(+), 1 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/pfam_db.loc.sample Thu Sep 05 08:08:21 2013 -0500 @@ -0,0 +1,3 @@ +#release_name release name path +#Pfam27.0_Pfam-A Pfam27.0 Pfam-A /data/pfam/Pfam-A.hmm +#Pfam27.0_Pfam-B Pfam27.0 Pfam-B /data/pfam/Pfam-B.hmm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Thu Sep 05 08:08:21 2013 -0500 @@ -0,0 +1,7 @@ +<tables> + <!-- Locations of Pfam HMM database files, listed in tool-data/pfam_db.loc --> + <table name="pfam_databases" comment_char="#"> + <columns>value, release, name, path</columns> + <file path="tool-data/pfam_db.loc" /> + </table> +</tables>
--- a/tool_dependencies.xml Fri Aug 30 10:56:26 2013 -0500 +++ b/tool_dependencies.xml Thu Sep 05 08:08:21 2013 -0500 @@ -9,5 +9,8 @@ <package name="bowtie" version="1.0.0"> <repository changeset_revision="e682af6a72cd" name="package_bowtie_1_0_0" owner="jjohnson" toolshed="http://testtoolshed.g2.bx.psu.edu" /> </package> + <package name="hmmer" version="3.0"> + <repository changeset_revision="3bc37773c609" name="package_hmmer_3_0" owner="jjohnson" toolshed="http://testtoolshed.g2.bx.psu.edu" /> + </package> </tool_dependency>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/transcriptsToOrfs.xml Thu Sep 05 08:08:21 2013 -0500
@@ -0,0 +1,123 @@
+<tool id="transcriptsToOrfs" name="transcriptsToOrfs" version="0.0.2">
+    <!-- Galaxy wrapper around the transdecoder script transcripts_to_best_scoring_ORFs.pl found under trinity-plugins. -->
+    <description>Trinity Transcripts to Candidate Peptides</description>
+    <requirements>
+        <requirement type="package" version="2013_08_14">trinityrnaseq</requirement>
+        <requirement type="package" version="3.0">hmmer</requirement>
+    </requirements>
+    <command>
+        \$TRINITY_HOME/trinity-plugins/transdecoder/transcripts_to_best_scoring_ORFs.pl
+        -t $transcripts
+        ## Optional numeric settings are passed only when a value was entered.
+        #if $min_prot_length:
+            -m $min_prot_length
+        #end if
+        #if $retain_long_orfs:
+            --retain_long_orfs $retain_long_orfs
+        #end if
+        #if $training_count:
+            -T $training_count
+        #end if
+        #if str($strand_specificity) == 'SS':
+            -S
+        #end if
+        #if str($genetic_code) != '':
+            -G $genetic_code
+        #end if
+        ## Compare the string value of the select (as the strand test above does) and take the
+        ## HMM file location from the "path" column of the chosen pfam_databases entry.
+        #if str($search.use_pfam) == 'yes':
+            --search_pfam "${search.pfam_db.fields.path}"
+            --CPU $search.CPU
+        #end if
+    </command>
+    <inputs>
+        <param format="fasta" name="transcripts" type="data" label="Transcripts sequences in fastA format" help="" />
+        <param name="min_prot_length" type="integer" value="" optional="true" label="Minimum peptide length (in amino acids)" help="default: 100">
+            <validator type="in_range" message="Minimum peptide length should be at least 50" min="50" />
+        </param>
+        <param name="retain_long_orfs" type="integer" value="" optional="true" label="Retain all ORFs found that are of minimum length in nucleotides" help="default: 900" >
+            <validator type="in_range" message="ORF length should be at least 50" min="50" />
+        </param>
+        <param name="training_count" type="integer" value="" optional="true" label="Number of top longest ORFs to train Markov Model (hexamer stats)" help="default: 500" >
+            <validator type="in_range" message="ORF count should be at least 50" min="50" />
+        </param>
+        <param name="strand_specificity" type="select" label="Strand specificity type">
+            <option value="DS">NOT strand specific, examine both strands</option>
+            <option value="SS">Strand specific, examine only top strand</option>
+        </param>
+        <param name="genetic_code" type="select" label="Genetic Code">
+            <option value="">use default(universal)</option>
+            <option value="universal">universal</option>
+            <option value="Euplotes">Euplotes</option>
+            <option value="Tetrahymena">Tetrahymena</option>
+            <option value="Candida">Candida</option>
+            <option value="Acetabularia">Acetabularia</option>
+        </param>
+        <conditional name="search">
+            <param name="use_pfam" type="select" label="Search PFAM database">
+                <option value="no">NO</option>
+                <option value="yes">YES</option>
+            </param>
+            <when value="no"/>
+            <when value="yes">
+                <param name="pfam_db" type="select" label="Pfam database">
+                    <options from_data_table="pfam_databases" />
+                </param>
+                <param name="CPU" type="integer" value="2" min="1" label="CPU" help="Number of CPUs to use by hmmscan" />
+            </when>
+        </conditional>
+    </inputs>
+    <stdio>
+        <exit_code range="1:" level="fatal" description="Failed" />
+        <regex match="Error"
+               source="stderr"
+               level="fatal"
+               description="Failed" />
+    </stdio>
+    <outputs>
+        <!-- The Pfam matches dataset is created only when the Pfam search is enabled; the other four are always created. -->
+        <data format="txt" name="trinity_pep_pfam" label="${tool.name} on ${on_string}: Pfam matches to Candidate Peptide Sequences" from_work_dir="longest_orfs.pep.pfam.dat">
+            <filter>search['use_pfam'] == 'yes'</filter>
+        </data>
+        <data format="gff3" name="trinity_pep_gff3" label="${tool.name} on ${on_string}: Candidate Peptide Features" from_work_dir="best_candidates.eclipsed_orfs_removed.gff3" />
+        <data format="bed" name="trinity_pep_bed" label="${tool.name} on ${on_string}: Candidate Peptide Coordinates" from_work_dir="best_candidates.eclipsed_orfs_removed.bed" />
+        <data format="fasta" name="trinity_pep_cds" label="${tool.name} on ${on_string}: Candidate Peptide CDS Sequences" from_work_dir="best_candidates.eclipsed_orfs_removed.cds"/>
+        <data format="fasta" name="trinity_pep_seqs" label="${tool.name} on ${on_string}: Candidate Peptide Sequences" from_work_dir="best_candidates.eclipsed_orfs_removed.pep"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="transcripts" ftype="fasta" value="TrinitySingle.fasta"/>
+            <param name="min_prot_length" value="100"/>
+            <param name="use_pfam" value="no"/>
+            <output name="trinity_pep_seqs">
+                <assert_contents>
+                    <has_text text="WAAKAWLITARSLYPADF" />
+                </assert_contents>
+            </output>
+            <output name="trinity_pep_cds">
+                <assert_contents>
+                    <has_text text="TGGGCAGCCAAGGCATGGCTGATCACGGCCCGCA" />
+                </assert_contents>
+            </output>
+            <output name="trinity_pep_bed">
+                <assert_contents>
+                    <has_text text="comp10_c0_seq1" />
+                </assert_contents>
+            </output>
+            <output name="trinity_pep_gff3">
+                <assert_contents>
+                    <has_text text="comp10_c0_seq1" />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help>
+**transcriptsToOrfs**
+
+Trinity_ is a de novo transcript assembler that uses RNA-seq data as input.
+This tool searches for open reading frames in the assembled transcripts.
+
+.. _Trinity: http://trinityrnaseq.sourceforge.net
+    </help>
+</tool>
--- a/trinityrnaseq.xml Fri Aug 30 10:56:26 2013 -0500 +++ b/trinityrnaseq.xml Thu Sep 05 08:08:21 2013 -0500 @@ -110,7 +110,7 @@ </conditional> </inputs> <stdio> - <exit_code range="1:" level="fatal" description="Faiiled" /> + <exit_code range="1:" level="fatal" description="Failed" /> <regex match="command not found" source="both" level="fatal"
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/trinityrnaseq_norm.xml Thu Sep 05 08:08:21 2013 -0500
@@ -0,0 +1,180 @@
+<tool id="trinityrnaseq_norm" name="Trinity read normalization" version="0.0.2">
+    <!-- Written by Jeremy Goecks, modified by Josh Bowden for normalization procedure, now maintained here by bhaas -->
+    <description>Pre-process RNA-seq data to reduce coverage of highly covered areas</description>
+    <requirements>
+        <requirement type="package" version="2013_08_14">trinityrnaseq</requirement>
+    </requirements>
+    <command>
+        ## symlink input in work_dir
+        #if str($inputs.paired_or_single) == "paired":
+            ln -s $inputs.left_input left_reads &amp;&amp;
+            ln -s $inputs.right_input right_reads &amp;&amp;
+        #else:
+            ln -s $inputs.input single_reads &amp;&amp;
+        #end if
+        \${TRINITY_HOME}/util/normalize_by_kmer_coverage.pl --JM $JM --max_cov $max_cov
+        ## Inputs.
+        ## The FASTA datatype name used by this tool is "fasta"; "fa" is still accepted. Anything else is passed as fq.
+        #if str($inputs.paired_or_single) == "paired":
+            --left left_reads --right right_reads
+            #if str($inputs.left_input.ext) in ('fa', 'fasta'):
+                --seqType fa
+            #else:
+                --seqType fq
+            #end if
+            $inputs.pe_reads_unordered
+            #if str($inputs.library_type) != "None":
+                --SS_lib_type $inputs.library_type
+            #end if
+            $inputs.pairs_together
+            $inputs.parallel_stats
+        #else:
+            --single single_reads
+            #if str($inputs.input.ext) in ('fa', 'fasta'):
+                --seqType fa
+            #else:
+                --seqType fq
+            #end if
+            #if str($inputs.library_type) != "None":
+                --SS_lib_type $inputs.library_type
+            #end if
+        #end if
+        #if $kmer_size:
+            --KMER_SIZE $kmer_size
+        #end if
+        #if $max_pct_stdev:
+            --max_pct_stdev $max_pct_stdev
+        #end if
+        ## direct stdio to output
+        ## NOTE(review): because of the pipe, the status tested by the following "and" is that of tee, not of the script.
+        | tee $trinity_coverage_normalization_log &amp;&amp;
+        #if str($inputs.paired_or_single) == "paired":
+            cp left_reads.normalized* $output_left &amp;&amp;
+            cp right_reads.normalized* $output_right
+        #else:
+            cp single_reads.normalized* $output_single
+        #end if
+    </command>
+    <inputs>
+        <param name="JM" type="select" label="JM" help="Amount of memory to allocate to Jellyfish for Kmer catalog construction">
+            <option value="1G">1G</option>
+            <option value="10G">10G</option>
+            <option value="20G">20G</option>
+            <option value="50G">50G</option>
+            <option value="100G">100G</option>
+        </param>
+
+        <param name="max_cov" type="select" label="max_cov" help="Read coverage in terms of maximum coverage to keep">
+            <option value="30">30</option>
+            <option value="40">40</option>
+            <option value="50">50</option>
+            <option value="60">60</option>
+            <option value="70">70</option>
+            <option value="100">100</option>
+        </param>
+
+        <conditional name="inputs">
+            <param name="paired_or_single" type="select" label="Paired or Single-end data?">
+                <option value="paired">Paired</option>
+                <option value="single">Single</option>
+            </param>
+            <when value="paired">
+                <param format="fasta,fastq" name="left_input" type="data" label="Left/Forward strand reads" help=""/>
+                <param format="fasta,fastq" name="right_input" type="data" label="Right/Reverse strand reads" help=""/>
+                <param name="library_type" type="select" label="Strand-specific Library Type">
+                    <option value="None">None</option>
+                    <option value="FR">FR</option>
+                    <option value="RF">RF</option>
+                </param>
+                <param name="pe_reads_unordered" type="boolean" truevalue="--PE_reads_unordered" falsevalue="" checked="false" label="set if the input paired-end reads are not identically ordered"/>
+                <param name="pairs_together" type="boolean" truevalue="--pairs_together" falsevalue="" checked="false" label="process paired reads by averaging stats between pairs and retaining linking info"/>
+                <param name="parallel_stats" type="boolean" truevalue="--PARALLEL_STATS" falsevalue="" checked="false" label="generate read stats in parallel for paired reads" help="(Figure 2X Inchworm memory requirement)"/>
+            </when>
+            <when value="single">
+                <param format="fasta,fastq" name="input" type="data" label="Single-end reads" help=""/>
+                <param name="library_type" type="select" label="Strand-specific Library Type">
+                    <option value="None">None</option>
+                    <option value="F">F</option>
+                    <option value="R">R</option>
+                </param>
+            </when>
+        </conditional>
+        <param name="kmer_size" type="integer" value="" optional="true" label="KMER SIZE" help="default: 25">
+            <validator type="in_range" message="kmer size between 3 and 200" min="3" max="200"/>
+        </param>
+        <param name="max_pct_stdev" type="integer" value="" optional="true" label="maximum pct of mean for stdev of kmer coverage across read" help="default: 100">
+            <validator type="in_range" message="max pct stdev between 10 and 100" min="10" max="100"/>
+        </param>
+
+    </inputs>
+    <stdio>
+        <exit_code range="1:" level="fatal" description="Failed" />
+        <regex match="Error"
+               source="stdout"
+               level="fatal"
+               description="Failed" />
+    </stdio>
+    <outputs>
+        <!-- The log is always produced; filters select the left/right datasets for paired runs and the single dataset for single-end runs. -->
+        <data format="txt" name="trinity_coverage_normalization_log" label="${tool.name} on ${on_string}: log" />
+        <data format_source="left_input" name="output_left" label="${tool.name} on ${on_string}: Normalized left reads">
+            <filter>inputs['paired_or_single'] == "paired"</filter>
+        </data>
+        <data format_source="right_input" name="output_right" label="${tool.name} on ${on_string}: Normalized right reads">
+            <filter>inputs['paired_or_single'] == "paired"</filter>
+        </data>
+        <data format_source="input" name="output_single" label="${tool.name} on ${on_string}: Normalized reads">
+            <filter>inputs['paired_or_single'] == "single"</filter>
+        </data>
+    </outputs>
+    <tests>
+        <test>
+            <param name="JM" value="1G"/>
+            <param name="max_cov" value="30"/>
+            <param name="paired_or_single" value="single"/>
+            <param name="input" ftype="fastq" value="reads.left.fq"/>
+            <param name="library_type" value="None"/>
+            <output name="trinity_coverage_normalization_log">
+                <assert_contents>
+                    <has_text text="Normalization complete." />
+                </assert_contents>
+            </output>
+            <output name="output_single">
+                <assert_contents>
+                    <has_text text="ACTGCATCCTGGAAAGAATCAATGGTGGCCGGAAAGTGTTTTTCAAATACAAGAGTGACAATGTGCCCTGTTGTTT" />
+                </assert_contents>
+            </output>
+        </test>
+        <test>
+            <param name="JM" value="1G"/>
+            <param name="max_cov" value="30"/>
+            <param name="paired_or_single" value="paired"/>
+            <param name="left_input" ftype="fastq" value="reads.left.fq"/>
+            <param name="right_input" ftype="fastq" value="reads.right.fq"/>
+            <param name="library_type" value="None"/>
+            <param name="pe_reads_unordered" value="False"/>
+            <param name="pairs_together" value="False"/>
+            <param name="parallel_stats" value="False"/>
+            <output name="trinity_coverage_normalization_log">
+                <assert_contents>
+                    <has_text text="Normalization complete." />
+                </assert_contents>
+            </output>
+            <output name="output_left">
+                <assert_contents>
+                    <has_text text="CTGGGCTGCAGCTAAGTTCTCTGCATCCTCCTTCTTGCTTGTGGCTGGGAAGAAGACAATGTTGTCGATGGTCTGG" />
+                </assert_contents>
+            </output>
+            <output name="output_right">
+                <assert_contents>
+                    <has_text text="CTCAAATGGTTAATTCTCAGGCTGCAAATATTCGTTCAGGATGGAAGAACATTTTCTCAGTATTCCATCTAGCTGC" />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help>
+Runs the Trinity_ script util/normalize_by_kmer_coverage.pl, which reduces data sizes with minimal impact on recovered transcripts when used by Trinity.pl.
+
+.. _Trinity: http://trinityrnaseq.sourceforge.net
+    </help>
+</tool>