view ensembl_cdna_translate.xml @ 7:d59e3ce10e74 draft

Uploaded
author jjohnson
date Wed, 13 Dec 2017 11:15:34 -0500
parents f8e0e5e237a5
children 5c92d0be6514
line wrap: on
line source

<tool id="ensembl_cdna_translate" name="Ensembl cDNA Translations" version="0.1.0">
    <description>using Ensembl REST API</description>
    <!-- Shared snippets: the species_options and biotypes_help macros expanded
         in the inputs section are expected to be defined in this file. -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <!-- Packages needed at run time, each pinned to an exact version. -->
    <requirements>
        <requirement version="0.4.10" type="package">requests-cache</requirement>
        <requirement version="1.62" type="package">biopython</requirement>
    </requirements>
    <!-- Any exit status of 1 or higher is reported as an error. -->
    <stdio>
        <exit_code range="1:"/>
    </stdio>
    <command><![CDATA[
        ## Stage 1: produce transcript BED records, either by reading the
        ## history dataset or by calling the helper script in retrieval mode.
        #if $features.feature_src == 'history_bed':
            cat '$features.input'
        #else
            python '$__tool_directory__/ensembl_cdna_translate.py'
            #if $species:
                --species '$species'
            #end if
            $features.transcript_raw
            #if $features.biotypes:
                --biotypes '$features.biotypes'
            #end if
            #if $features.regions:
                --regions '$features.regions'
            #end if
            ## output_choice is a multi-select; str() of it is searched for the
            ## option names. 'translation' matches both translation_bed and
            ## translation_fasta.
            #if str($output_choice).find('transcript_bed') >= 0:
                --transcripts
                #if str($output_choice).find('translation') >= 0:
                    ## Transcripts go to stdout for stage 2; tee keeps a copy
                    ## as the transcript_bed output.
                    '-' | tee '$transcript_bed'
                #else
                    '$transcript_bed'
                #end if
            #elif str($output_choice).find('translation') >= 0:
                --transcripts '-'
            #end if
        #end if
        ## Stage 2: translate the BED records arriving on stdin.
        #if str($output_choice).find('translation') >= 0:
          | python '$__tool_directory__/ensembl_cdna_translate.py' -i '-'
            --min_length $min_length
            ## Expands to --all when the "Report all translations" box is
            ## checked, otherwise to nothing.
            $translate_all
            #if $enzyme:
                --enzyme '$enzyme'
            #end if
            ## Only the history_bed branch needs stage 2 to write the
            ## transcripts file; the retrieval branch already did so via tee.
            ## The dataset parameter lives inside the features conditional,
            ## so the branch selector is tested rather than a bare $input.
            #if $features.feature_src == 'history_bed' and str($output_choice).find('transcript_bed') >= 0:
                --transcripts '$transcript_bed'
            #end if
            #if str($output_choice).find('translation_bed') >= 0:
                --bed '$translation_bed'
            #end if
            #if str($output_choice).find('translation_fasta') >= 0:
                --fasta '$translation_fasta'
            #end if
            ###if $features.biotypes:
            ##    --biotypes '$features.biotypes'
            ###end if
        #end if

    ]]></command>
    <inputs>
        <!-- Species name; extra child elements are expanded from the
             species_options macro, which is defined outside this file. -->
        <param name="species" type="text" value="" label="Ensembl species" >
            <help>
            </help>
            <expand macro="species_options" />
        </param>
        <!-- Source of the transcript BED records: an Ensembl query or a
             dataset from the history. -->
        <conditional name="features">
            <param name="feature_src" type="select" label="Features to translate">
                <option value="ensembl_rest">Retrieve from Ensembl</option>
                <option value="history_bed">Use Ensembl BED file</option>
            </param>
            <when value="ensembl_rest">
                <param name="transcript_raw" type="boolean" truevalue="--raw" falsevalue="" checked="true" 
                    label="Keep extra columns from ensembl BED"/>
                <param name="biotypes" type="text" value="" optional="true" label="Restrict Feature retrieval to these biotypes" >
                    <expand macro="biotypes_help" />
                </param>
                <param name="regions" type="text" value="" optional="true" label="Restrict Feature retrieval to comma-separated list of regions" >
                    <help>Each region is specified as: chr or chr:pos or chr:from-to</help>
                    <!-- Accepts an empty value, or one or more comma-separated regions. -->
                    <validator type="regex" message="Regions must be a comma-separated list of chr, chr:pos or chr:from-to entries">^(\w+(:\d+(-\d+)?)?(,\w+(:\d+(-\d+)?)?)*)?$</validator>
                </param>
            </when>
            <when value="history_bed">
                <!-- Required: the command reads this dataset with cat whenever
                     this branch is selected, so an unset value cannot work. -->
                <param name="input" type="data" format="bed" label="A BED file with 12 columns" 
                    help="thickStart and thickEnd define protein coding region, blocks define exon regions"/>
                <!-- NOTE(review): the command template has the biotypes option
                     commented out for the translation step, so this value
                     appears to be unused in this branch. Confirm. -->
                <param name="biotypes" type="text" value="" optional="true" label="Restrict Feature translation to these biotypes" >
                    <expand macro="biotypes_help" />
                </param>
            </when>
        </conditional>
        <!--
        <conditional name="translations">
        </conditional>
        -->


        <param name="translate_all" type="boolean" truevalue="--all" falsevalue="" checked="false" 
            label="Report all translations (Default is non reference protein sequences)"/>
        <!-- Selects which output datasets the tool produces. -->
        <param name="output_choice" type="select" multiple="true" display="checkboxes" label="Outputs">
            <option value="transcript_bed">transcripts.bed</option>
            <option value="translation_bed">translation.bed</option>
            <option value="translation_fasta">translation.fasta</option>
        </param>
        <param name="min_length" type="integer" value="7" min="1" label="Minimum length of protein translation to report"/>
        <param name="enzyme" type="select" optional="true" label="Digest enzyme" 
             help="Remove frags that are in a reference protein">
            <option value="trypsin">trypsin:       ([KR](?=[^P]))|((?&lt;=W)K(?=P))|((?&lt;=M)R(?=P))</option>
        </param>
    </inputs>
    <outputs>
        <!-- Each dataset is created only when its box is ticked under "Outputs".
             The leading "output_choice and" keeps the membership test from
             raising when nothing is ticked and the value is None. -->
        <data name="transcript_bed" format="bed" label="Ensembl $species transcripts.bed">
            <filter>output_choice and 'transcript_bed' in output_choice</filter>
        </data>
        <data name="translation_bed" format="bed" label="Ensembl $species translation.bed">
            <filter>output_choice and 'translation_bed' in output_choice</filter>
        </data>
        <data name="translation_fasta" format="fasta" label="Ensembl $species translation.fasta">
            <filter>output_choice and 'translation_fasta' in output_choice</filter>
        </data>
    </outputs>
    <tests>
        <!-- Translates a BED file supplied from the history and checks that a
             known transcript id appears in both translation outputs. -->
        <test>
            <param name="species" value="human"/>
            <!-- The history_bed branch must be selected explicitly; the
                 select's first option (ensembl_rest) would otherwise be used
                 and the supplied BED file would be ignored. -->
            <conditional name="features">
                <param name="feature_src" value="history_bed"/>
                <param name="input" value="human_transcripts.bed" ftype="bed"/>
            </conditional>
            <param name="output_choice" value="translation_bed,translation_fasta"/>
            <output name="translation_bed">
                <assert_contents>
                    <has_text text="ENST00000641515" />
                </assert_contents>
            </output>
            <output name="translation_fasta">
                <assert_contents>
                    <has_text text=">ENST00000641515" />
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
        usage: ensembl_cdna_translate.py [-h] [-s SPECIES] [-i INPUT] [-t TRANSCRIPTS]
                                 [-r] [-f FASTA] [-b BED] [-m MIN_LENGTH] [-a]
                                 [-v] [-d]

Retrieve Ensembl cDNAs and three frame translate

optional arguments:
  -h, --help            show this help message and exit
  -s SPECIES, --species SPECIES
                        Ensembl Species to retrieve
  -i INPUT, --input INPUT
                        Use this bed instead of retrieving cDNA from ensembl
                        (-) for stdin
  -t TRANSCRIPTS, --transcripts TRANSCRIPTS
                        Path to output cDNA transcripts.bed (-) for stdout
  -r, --raw             Report transcript exactly as returned from Ensembl
  -f FASTA, --fasta FASTA
                        Path to output translations.fasta
  -b BED, --bed BED     Path to output translations.bed
  -m MIN_LENGTH, --min_length MIN_LENGTH
                        Minimum length of protein translation to report
  -a, --all             Report all translations (Default is non reference
                        protein sequences)
  -v, --verbose         Verbose
  -d, --debug           Debug

The Ensembl REST API returns a 20-column BED format with these additional columns::

  second_name, cds_start_status, cds_end_status, exon_frames, type, gene_name, second_gene_name, gene_type

    ]]></help>
    <!-- Publications, identified by DOI, listed as citations for this tool. -->
    <citations>
        <citation type="doi">10.1093/bioinformatics/btu613</citation>
        <citation type="doi">10.1093/nar/gku1010</citation>
    </citations>
</tool>