view mzsqlite_psm_align.xml @ 1:4f8cf8fbef57 draft default tip

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mzsqlite_psm_align commit b0c57cac4e558d974a16b14d4498cf8d4ba9e0c7
author galaxyp
date Thu, 19 Apr 2018 14:30:28 -0400
parents f2dc9805107a
children
line wrap: on
line source

<tool id="mzsqlite_psm_align" name="MzSQLite ProBAM ProBED" version="0.1.0">
    <description>from mz.sqlite and genomic mapping</description>
    <!-- Package dependencies declared for this tool.
         NOTE(review): none of the entries carries a version attribute;
         consider pinning versions for reproducible installs. -->
    <requirements>
        <requirement type="package">biopython</requirement>
        <requirement type="package">twobitreader</requirement>
        <requirement type="package">pysam</requirement>
        <requirement type="package">gffutils</requirement>
    </requirements>
    <!-- Cheetah template that assembles the mzsqlite_psm_align.py command line.
         Every substituted value is single-quoted so that paths, free-text input
         and an unset dbkey ("?") are never word-split or glob-expanded by the shell. -->
    <command detect_errors="exit_code"><![CDATA[
        ## Symlink the optional read-alignment BAM and its index to fixed names
        ## (input.bam / input.bam.bai) in the job working directory.
        #if $readlignments:
            ln -s -f '${readlignments}' 'input.bam' &&
            ln -s -f '${readlignments.metadata.bam_index}' 'input.bam.bai' &&
        #end if
        python '$__tool_directory__/mzsqlite_psm_align.py'
            ## Genomic sequence: either a cached data-table entry or a history dataset.
            #if $ref.ref_source == 'cached':
                --twobit='$ref.ref_loc.fields.path'
            #elif $ref.ref_source == 'history':
                --twobit='$ref.ref_file'
            #end if
            #if $gffutilsdb:
                --gffutils_sqlite '$gffutilsdb'
            #end if
            #if $readlignments:
                --reads_bam 'input.bam'
            #end if
            ## output_formats is a multi-select; its string form is a comma-separated list.
            #if 'probed' in str($output_formats).split(','):
               --probed '$probed'
            #end if
            #if 'prosam' in str($output_formats).split(','):
               --prosam '$prosam'
            #end if
            #if 'probam' in str($output_formats).split(','):
               --probam '$probam'
            #end if
            ## Prefer the user-supplied reference name; otherwise fall back to the
            ## dbkey recorded on the genomic mapping dataset.
            #if $genomicref:
               --genomeReference '$genomicref'
            #else
               --genomeReference '$genomicdb.metadata.dbkey'
            #end if
            -v
            '$mzsqlitedb' '$genomicdb'
    ]]></command>
    <inputs>
        <!-- Required databases; both are passed to the script as positional arguments. -->
        <param name="mzsqlitedb" type="data" format="mz.sqlite" label="mz.sqlite database"
               help="generated from mzIdentML by mz_to_sqlite"/>
        <param name="genomicdb" type="data" format="sqlite" label="genomic mapping sqlite database"
               help="Genomic mapping for the Search proteins in the mzIdentML"/>
        <!-- Genomic sequence source: a locally cached twobit file or one from the history. -->
        <conditional name="ref">
            <param name="ref_source" type="select" label="Source for Genomic Sequence Data"
                   help="Used to generate the genomic reference sequence for identified peptides">
                <option value="cached">Locally cached twobit</option>
                <option value="history">History dataset twobit</option>
            </param>
            <when value="cached">
                <param name="ref_loc" type="select" label="Select reference 2bit file">
                    <options from_data_table="twobit" />
                </param>
            </when>
            <when value="history">
                <param name="ref_file" type="data" format="twobit" label="reference 2bit file" />
            </when>
        </conditional>
        <!-- Optional inputs; each is only added to the command line when supplied. -->
        <param name="gffutilsdb" type="data" format="sqlite" label="gffutils sqlite database" optional="true"
               help="Categorizes the peptide by GTF feature for the proBAM XG tag" />
        <param name="readlignments" type="data" format="bam" label="read alignments bam" optional="true"
               help="Allows proBAM SEQ field to be modified with observed variants"/>
        <!-- When left empty, the command falls back to the dbkey of the genomic mapping dataset. -->
        <param name="genomicref" type="text" value="" label="Genome Reference name" optional="true"
               help="The genome reference name to use in the proBED genomeReferenceVersion column"/>
        <!-- Selects which output datasets are produced; see the filters in the outputs section. -->
        <param name="output_formats" type="select" display="checkboxes" label="outputs" multiple="true">
            <option value="probam" selected="true">pro.bam</option>
            <option value="prosam">pro.sam</option>
            <option value="probed">pro.bed</option>
        </param>
    </inputs>
    <outputs>
        <!-- One dataset per format ticked in the output_formats parameter.
             The leading "output_formats and" guard makes each filter evaluate to a
             false value, instead of raising, when no format is selected (None),
             so that no empty datasets are created in that case. -->
        <data name="prosam" format="sam" label="pro.SAM on ${on_string}">
            <filter>output_formats and 'prosam' in output_formats</filter>
        </data>
        <data name="probam" format="bam" label="pro.Bam on ${on_string}">
            <filter>output_formats and 'probam' in output_formats</filter>
        </data>
        <data name="probed" format="bed" label="pro.Bed on ${on_string}">
            <filter>output_formats and 'probed' in output_formats</filter>
        </data>
    </outputs>
    <help><![CDATA[

Generates proBAM_ or proBED_ feature alignment files for peptides identified from a mass spectrometry protein search analysis.


The tool mz_to_sqlite_ generates a SQLite database for an mzIdentML file,
along with the fasta search database and the spectrum files used in the search.
This mz.sqlite database is used in conjunction with a genomic mapping sqlite database
to generate the proBAM_ or proBED_ feature alignment files.

The genomic mapping sqlite database has this schema:

::

    CREATE TABLE feature_cds_map (	/* One row for each exon in the search protein */
        name TEXT, 		/* Accession name of search protein in mzIdentML */
        chrom TEXT, 		/* Reference genome chromosome for this exon */
        start INTEGER, 		/* genomic start of the exon (zero-based like BED) */
        end INTEGER, 		/* genomic end of the exon (non-inclusive like BED) */
        strand TEXT, 		/* genomic strand: '+' or '-' */
        cds_start INTEGER, 	/* The CDS coding start for this exon (zero-based) */
        cds_end INTEGER		/* The CDS coding end for this exon (non-inclusive) */
    );


Example:

::

    sqlite> select * from feature_cds_map WHERE name like 'ENSMUSP00000000001%';
    name                   chrom    start           end         strand  cds_start cds_end
    ENSMUSP00000000001      chr3    108145887       108146005       -       0       118
    ENSMUSP00000000001      chr3    108123794       108123837       -       118     161
    ENSMUSP00000000001      chr3    108123541       108123683       -       161     303
    ENSMUSP00000000001      chr3    108118300       108118458       -       303     461
    ENSMUSP00000000001      chr3    108115762       108115891       -       461     590
    ENSMUSP00000000001      chr3    108112472       108112602       -       590     720
    ENSMUSP00000000001      chr3    108111934       108112088       -       720     874
    ENSMUSP00000000001      chr3    108109421       108109612       -       874     1065

Each row represents an exon in the search protein.  
The locations: start,end, cds_start, and cds_end are **zero-based** like BED format.

The **name** field must match the **accession** name used in the mz.sqlite database 
and thus the mzIdentML search results file.

The protein positions are described in CDS base offsets rather than amino acid offsets
to allow for codons being split across exons. 

This schema can describe structural variants as well as canonical transcripts.

.. _proBAM: http://www.psidev.info/probam
.. _proBED: http://www.psidev.info/probed
.. _mz_to_sqlite: https://toolshed.g2.bx.psu.edu/view/galaxyp/mz_to_sqlite/e34bdac5b157

    ]]></help>
    <!-- Publication to cite for this tool, referenced by DOI. -->
    <citations>
        <citation type="doi">10.1186/s13059-017-1377-x</citation>
    </citations>
</tool>