<!--
view ensembl_cdna_translate.xml @ 9:4d3ac66875d2 draft default tip

Uploaded
author jjohnson
date Thu, 14 Dec 2017 13:35:00 -0500
parents 5c92d0be6514
children
line wrap: on
line source
-->

<tool id="ensembl_cdna_translate" name="Ensembl cDNA Translations" version="0.1.0">
    <description>using Ensembl REST API</description>

    <!-- The macros expanded elsewhere in this tool (species_options, biotypes_help)
         are not defined in this file, so they presumably come from macros.xml. -->
    <macros>
        <import>macros.xml</import>
    </macros>

    <!-- Packages the tool declares as dependencies, each pinned to one version. -->
    <requirements>
        <requirement type="package" version="0.4.10">requests-cache</requirement>
        <requirement type="package" version="1.62">biopython</requirement>
        <requirement type="package" version="3.1.4">twobitreader</requirement>
    </requirements>

    <!-- Any exit status of 1 or above fails the job; "fatal" is the level Galaxy
         assumes when none is given, written out here for clarity. -->
    <stdio>
        <exit_code range="1:" level="fatal" />
    </stdio>
    <command><![CDATA[
        ## Two-stage pipeline.
        ## Stage 1 puts a transcripts BED on stdout: either the history dataset
        ## (cat) or a retrieval run of ensembl_cdna_translate.py.
        ## Stage 2 runs only when a translation output is selected and reads
        ## that BED from stdin (-i '-').
        ##
        ## Without pipefail the shell reports only the exit status of the last
        ## stage, so a failing stage 1 would be missed by the exit-code check.
        set -o pipefail &&
        #set $choices = str($output_choice)
        #set $want_transcripts = 'transcript_bed' in $choices
        ## Deliberate substring test: true for translation_bed and translation_fasta.
        #set $want_translation = 'translation' in $choices
        #set $from_history = str($features.feature_src) == 'history_bed'
        #if $from_history:
            cat '$features.input'
            #if $want_transcripts and not $want_translation:
                ## No stage 2 will consume the stream, so copy it into the
                ## requested transcripts output instead of leaving that empty.
                > '$transcript_bed'
            #end if
        #else
            python '$__tool_directory__/ensembl_cdna_translate.py'
            #if $species:
                --species '$species'
            #end if
            $features.transcript_raw
            #if $features.biotypes:
                --biotypes '$features.biotypes'
            #end if
            #if $features.regions:
                --regions '$features.regions'
            #end if
            #if $want_transcripts:
                --transcripts
                #if $want_translation:
                    ## Stream to stage 2 while tee keeps a copy as the output dataset.
                    '-' | tee '$transcript_bed'
                #else
                    '$transcript_bed'
                #end if
            #elif $want_translation:
                --transcripts '-'
            #end if
        #end if
        #if $want_translation:
          | python '$__tool_directory__/ensembl_cdna_translate.py' -i '-'
            ## A ref_source of 'ensembl_rest' passes no twobit option at all.
            #if $ref.ref_source == 'cached':
                --twobit='$ref.ref_loc.fields.path'
            #elif $ref.ref_source == 'history':
                --twobit='$ref.ref_file'
            #end if
            --min_length $translations.min_length
            #if $translations.enzyme:
                --enzyme '$translations.enzyme'
            #end if
            $translations.translate_all
            ## With a history BED there is no tee in stage 1, so stage 2 is asked
            ## to write the transcripts output itself.
            #if $from_history and $want_transcripts:
                --transcripts '$transcript_bed'
            #end if
            #if 'translation_bed' in $choices:
                --bed '$translation_bed'
            #end if
            #if 'translation_fasta' in $choices:
                --fasta '$translation_fasta'
            #end if
            #if $features.biotypes:
                --biotypes '$features.biotypes'
            #end if
        #end if
    ]]></command>
    <inputs>
        <!-- Free-text species; the command passes it as the species option only when
             it is non-empty. Extra child elements come from the species_options macro. -->
        <param name="species" type="text" value="" label="Ensembl species" >
            <help>Species name or alias recognized by the Ensembl REST API, for example: human</help>
            <expand macro="species_options" />
        </param>
        <!-- Source of the transcript features: a fresh retrieval through the wrapped
             script (ensembl_rest) or a BED dataset already in the history (history_bed). -->
        <conditional name="features">
            <param name="feature_src" type="select" label="Features to translate">
                <option value="ensembl_rest">Retrieve from Ensembl</option>
                <option value="history_bed">Use Ensembl BED file</option>
            </param>
            <when value="ensembl_rest">
                <param name="transcript_raw" type="boolean" truevalue="--raw" falsevalue="" checked="true"
                    label="Keep extra columns from ensembl BED"/>
                <param name="biotypes" type="text" value="" optional="true" label="Restrict Feature retrieval to these biotypes" >
                    <expand macro="biotypes_help" />
                </param>
                <param name="regions" type="text" value="" optional="true" label="Restrict Feature retrieval to comma-separated list of regions" >
                    <help>Each region is specified as: chr or chr:pos or chr:from-to</help>
                    <!-- The pattern also accepts the empty string, so the field may stay blank. -->
                    <validator type="regex" message="Regions must be a comma-separated list of chr, chr:pos or chr:from-to entries">^(\w+(:\d+(-\d+)?)?(,\w+(:\d+(-\d+)?)?)*)?$</validator>
                </param>
            </when>
            <when value="history_bed">
                <!-- Required: the command always runs cat on this dataset in this branch,
                     so leaving it unset could never produce a working job. -->
                <param name="input" type="data" format="bed" label="A BED file with 12 columns"
                    help="thickStart and thickEnd define protein coding region, blocks define exon regions"/>
                <param name="biotypes" type="text" value="" optional="true" label="Restrict Feature translation to these biotypes" >
                    <expand macro="biotypes_help" />
                </param>
            </when>
        </conditional>
        <!-- Where the translation stage gets genomic sequence from. The command passes a
             twobit path for "cached" and "history" and no twobit option for "ensembl_rest". -->
        <conditional name="ref">
            <param type="select" name="ref_source" label="Source for Genomic Sequence Data">
                <option value="cached">Locally cached twobit</option>
                <option value="history">History dataset twobit</option>
                <option value="ensembl_rest">Retrieve sequences from Ensembl (Slow and only for Ensembl Transcripts)</option>
            </param>

            <!-- Entry of the "twobit" data table; the command reads its "path" field. -->
            <when value="cached">
                <param type="select" name="ref_loc" label="Select reference 2bit file">
                    <options from_data_table="twobit"/>
                </param>
            </when>

            <when value="history">
                <param type="data" name="ref_file" format="twobit" label="reference 2bit file"/>
            </when>

            <!-- Nothing further to ask for in this mode. -->
            <when value="ensembl_rest" />
        </conditional>
        <!-- Options the command forwards to the translation stage only. -->
        <section name="translations" title="Translation  Options" expanded="false">
            <param name="min_length"
                   type="integer"
                   value="10"
                   min="1"
                   label="Minimum length of protein translation to report"/>
            <!-- Unchecked by default, in which case no flag is emitted. -->
            <param name="translate_all"
                   type="boolean"
                   checked="false"
                   truevalue="--all"
                   falsevalue=""
                   label="Report all translations (Default is non reference protein sequences)"/>
            <!-- Optional; the command adds the enzyme option only when a value is chosen. -->
            <param name="enzyme"
                   type="select"
                   optional="true"
                   label="Digest enzyme"
                   help="Remove frags that are in a reference protein">
                <option value="trypsin">trypsin:       ([KR](?=[^P]))|((?&lt;=W)K(?=P))|((?&lt;=M)R(?=P))</option>
            </param>
        </section>
        <!-- Which datasets to produce. The command and the output filters both test
             the selected values, which match the names of the output datasets. -->
        <param name="output_choice"
               type="select"
               multiple="true"
               display="checkboxes"
               label="Outputs">
            <option value="transcript_bed">transcripts.bed</option>
            <option value="translation_bed">translation.bed</option>
            <option value="translation_fasta">translation.fasta</option>
        </param>
    </inputs>
    <!-- One dataset per box ticked in output_choice. A multiple select with nothing
         ticked may arrive as None, and "x in None" raises TypeError, so each filter
         checks that a selection exists before testing membership. -->
    <outputs>
        <data name="transcript_bed" format="bed" label="Ensembl ${species} transcripts.bed">
            <filter>output_choice and 'transcript_bed' in output_choice</filter>
        </data>
        <data name="translation_bed" format="bed" label="Ensembl ${species} translation.bed">
            <filter>output_choice and 'translation_bed' in output_choice</filter>
        </data>
        <data name="translation_fasta" format="fasta" label="Ensembl ${species} translation.fasta">
            <filter>output_choice and 'translation_fasta' in output_choice</filter>
        </data>
    </outputs>
    <tests>
        <!-- Translates a BED taken from the history and requests both translation
             outputs. ref_source is ensembl_rest, so no twobit file is supplied.
             NOTE(review): this presumably needs network access to Ensembl; confirm. -->
        <test>
            <param name="species" value="human"/>
            <param name="feature_src" value="history_bed"/>
            <param name="input" value="human_transcripts.bed" ftype="bed"/>
            <param name="ref_source" value="ensembl_rest"/>
            <param name="output_choice" value="translation_bed,translation_fasta"/>
            <!-- Both outputs must mention the same transcript identifier; the FASTA
                 check expects it at the start of a header line. -->
            <output name="translation_bed">
                <assert_contents>
                    <has_text text="ENST00000641515" />
                </assert_contents>
            </output>
            <output name="translation_fasta">
                <assert_contents>
                    <has_text text=">ENST00000641515" />
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
        usage: ensembl_cdna_translate.py [-h] [-s SPECIES] [-i INPUT] [-t TRANSCRIPTS]
                                 [-r] [-f FASTA] [-b BED] [-m MIN_LENGTH] [-a]
                                 [-v] [-d]

Retrieve Ensembl cDNAs and three frame translate

optional arguments:
  -h, --help            show this help message and exit
  -s SPECIES, --species SPECIES
                        Ensembl Species to retrieve
  -i INPUT, --input INPUT
                        Use this bed instead of retrieving cDNA from ensembl
                        (-) for stdin
  -t TRANSCRIPTS, --transcripts TRANSCRIPTS
                        Path to output cDNA transcripts.bed (-) for stdout
  -r, --raw             Report transcript exactly as returned from Ensembl
  -f FASTA, --fasta FASTA
                        Path to output translations.fasta
  -b BED, --bed BED     Path to output translations.bed
  -m MIN_LENGTH, --min_length MIN_LENGTH
                        Minimum length of protein translation to report
  -a, --all             Report all translations (Default is non reference
                        protein sequences)
  -v, --verbose         Verbose
  -d, --debug           Debug

The Ensembl REST API returns a 20-column BED format with these additional columns::

  second_name, cds_start_status, cds_end_status, exon_frames, type, gene_name, second_gene_name, gene_type

    ]]></help>
    <!-- Publications to cite when using this tool, given by DOI.
         NOTE(review): presumably the Ensembl REST API and Ensembl database papers; confirm. -->
    <citations>
        <citation type="doi">10.1093/bioinformatics/btu613</citation>
        <citation type="doi">10.1093/nar/gku1010</citation>
    </citations>
</tool>