view pathwaymatcher.xml @ 0:1e38071ab9d5 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/pathwaymatcher commit c12a99d3da62c83b779175b3c9022e7d5622053a
author galaxyp
date Wed, 20 Jun 2018 14:20:53 -0400
parents
children ee1dfac35bb3
line wrap: on
line source

<tool id="reactome_pathwaymatcher" name="Pathway Matcher" version="@PATHWAYMATCHER_VERSION@.@TOOL_SUBVERSION@">
    <!-- Short summary of what the tool does. -->
    <description>
        PathwayMatcher is a software tool to search for pathways related to a list of proteins in Reactome.
    </description>
    <macros>
        <!-- Version of the wrapped PathwayMatcher package; used in the tool version
             string and in the package requirement. -->
        <token name="@PATHWAYMATCHER_VERSION@">1.8</token>
        <!-- Wrapper revision, appended to the package version in the tool's version attribute. -->
        <token name="@TOOL_SUBVERSION@">0</token>
        <!-- FASTA protein database selector, expanded by the peptide-based input types. -->
        <xml name="input_fasta">
            <param name="input_database" type="data" format="fasta" label="Protein Database"
                   help="Select FASTA database from history"/>
        </xml>
    </macros>
    <!-- Packages needed at run time. The command template only invokes the
         pathwaymatcher launcher plus mkdir/mv/cat; the graph files are returned as a
         dataset collection rather than an archive, so no zip package is required. -->
    <requirements>
        <requirement type="package" version="@PATHWAYMATCHER_VERSION@">pathwaymatcher</requirement>
    </requirements>
    <!-- Conditions under which the job is reported as failed. -->
    <stdio>
        <!-- Any non-zero exit status is fatal. -->
        <exit_code level="fatal" range="1:" description="Job Failed" />
        <!-- A Java exception name in the job output is fatal. -->
        <regex level="fatal" match="java.*Exception" description="Java Exception"/>
        <!-- The JVM could not be started at all. -->
        <regex level="fatal" match="Could not create the Java virtual machine" description="JVM Error"/>
    </stdio>
    <command>
<![CDATA[
        ## Command template (Cheetah + shell).
        ##   1. Run PathwayMatcher with one "-t TYPE -i FILE" group per configured input,
        ##      followed by the selected output flags; its stderr is appended to a file.
        ##   2. Remember PathwayMatcher's exit status straight away.
        ##   3. If graphs were requested and the run succeeded, move the graph files
        ##      into "graphs/" for collection discovery.
        ##   4. Print the captured stderr and finish with the remembered exit status.

        ## File (in the job working directory) that collects PathwayMatcher's stderr.
        #set $temp_stderr = "pathwaym_stderr"

        ## Graph types ticked by the user. With nothing ticked the value renders as
        ## "None", which matches none of the flags tested below.
        #set $output_graphs_list = str($output_options.output_graphs).split(',')

        ## NOTE(review): nothing else in this template references "output"; kept as in
        ## the original wrapper. Confirm it is unused before removing.
        mkdir output;
        cwd=`pwd`;
        export HOME=\$cwd;

        #####################
        ## Pathway Matcher ##
        #####################
        (pathwaymatcher src.main.java.no.uib.pap.pathwaymatcher.PathwayMatcher

            #for $s in $input_types
                #set $kind = str($s.input_type.input_type_selector)

                ## GENETIC VARIANTS
                #if $kind == "rsid"
                    -t rsid -i '${s.input_type.input_rsid}'
                #elif $kind == "chrbp"
                    -t chrbp -i '${s.input_type.input_chrbp}'
                #elif $kind == "vcf"
                    -t vcf -i '${s.input_type.input_vcf}'

                ## GENES
                #elif $kind == "gene"
                    -t gene -i '${s.input_type.input_gene}'

                ## PEPTIDES
                #elif $kind == "peptide"
                    -t peptide -i '${s.input_type.input_peptide}'
                    -f '${s.input_type.input_database}'
                    ## ptm_range is optional: only pass -r when a value was entered.
                    #if str($s.input_type.ptm_range) not in ('', 'None')
                        -r '${s.input_type.ptm_range}'
                    #end if
                #elif $kind == "modifiedpeptide"
                    -t modifiedpeptide -i '${s.input_type.input_modifiedpeptide}'
                    -f '${s.input_type.input_database}'
                    ## ptm_range is optional: only pass -r when a value was entered.
                    #if str($s.input_type.ptm_range) not in ('', 'None')
                        -r '${s.input_type.ptm_range}'
                    #end if

                ## PROTEINS
                #elif $kind == "uniprot"
                    -t uniprot -i '${s.input_type.input_uniprot}'
                #elif $kind == "ensembl"
                    -t ensembl -i '${s.input_type.input_ensembl}'

                ## PROTEOFORMS
                #elif $kind == "proteoforms"
                    #if $s.input_type.proteoform_match_criteria
                        -t proteoform -m '${s.input_type.proteoform_match_criteria}' -i '${s.input_type.input_proteoforms}'
                    #else
                        -t proteoform -i '${s.input_type.input_proteoforms}'
                    #end if
                #end if

            #end for

            ## OUTPUT OPTIONS

            ## The select holds the strings "0"/"1". Both are non-empty and therefore
            ## truthy, so the value has to be compared explicitly.
            #if str($output_options.search_top_level_info) == "1"
                -tlp
            #end if

            #if 'gg' in $output_graphs_list
                -gg
            #end if

            #if 'gu' in $output_graphs_list
                -gu
            #end if

            #if 'gp' in $output_graphs_list
                -gp
            #end if

        2>> $temp_stderr);

        ## Capture PathwayMatcher's status now, before any later command overwrites it.
        exit_code_for_galaxy=\$?;

        ## Collect the requested graph files in one folder. This is skipped when
        ## PathwayMatcher failed; a failing mkdir/mv replaces the (zero) status so that
        ## missing graph files still fail the job.
        #if $output_options.output_graphs
            if [ "\$exit_code_for_galaxy" -eq 0 ]; then
                mkdir "graphs"
                #if 'gg' in $output_graphs_list
                    && mv -t "graphs" "geneExternalEdges.tsv" "geneInternalEdges.tsv" "geneVertices.tsv"
                #end if
                #if 'gu' in $output_graphs_list
                    && mv -t "graphs" "proteinExternalEdges.tsv" "proteinInternalEdges.tsv" "proteinVertices.tsv"
                #end if
                #if 'gp' in $output_graphs_list
                    && mv -t "graphs" "proteoformExternalEdges.tsv" "proteoformInternalEdges.tsv" "proteoformVertices.tsv"
                #end if
                ;
                exit_code_for_galaxy=\$?;
            fi;
        #end if

        cat $temp_stderr 2>&1;
        (exit \$exit_code_for_galaxy)
]]>
    </command>
    <inputs>

        <!-- One or more inputs; each entry selects an input type and the matching file(s).
             The bracketed numbers in the help texts refer to the References list in the
             tool help. -->
        <repeat name="input_types" title="Input" min="1">
            <conditional name="input_type">
                <param name="input_type_selector" type="select" label="Input type">
                    <option value="rsid">Genetic variants - SNP rsId list</option>
                    <option value="chrbp">Genetic variants - Chromosomes and base pairs</option>
                    <option value="vcf">Genetic variants - Variant Call Format Specification</option>
                    <option value="gene">Genes</option>
                    <option value="peptide">Peptides - Simple list</option>
                    <option value="modifiedpeptide">Peptides - Peptide List with PTM types and sites</option>
                    <option value="uniprot">Proteins - UniProt Accession list</option>
                    <option value="ensembl">Proteins - Ensembl identifier list</option>
                    <option value="proteoforms">Proteoforms</option>
                </param>

                <!-- Genetic variants -->
                <when value="rsid">
                    <param format="txt" name="input_rsid" type="data" label="SNP rsId list"
                        help="The file contains one rsid identifier as defined in dbSNP[1] on each row.
                        The list must be ordered by chromosome and base pair (bp). The list must not have duplicates.
                        All rsids must appear in the human assembly GRCh37.p13."/>
                </when>

                <when value="chrbp">
                    <param format="txt" name="input_chrbp" type="data" label="Chromosomes and base pairs"
                        help="Genetic variants can also be represented using the chromosome and the base pair numbers.
                        The input should be sorted by chromosome number and then by base pair."/>
                </when>

                <when value="vcf">
                    <param format="vcf" name="input_vcf" type="data" label="Variant Call Format Specification"
                        help="The input follows the Variant Call Format Specification[2] v4.3.
                        It is also possible to specify only the first 4 columns in the data section of the file:
                        CHROM, POS, ID, REF."/>
                </when>

                <!-- Genes -->
                <when value="gene">
                    <param format="txt" name="input_gene" type="data" label="Genes"
                        help="File with one gene name per line. Genes follow the HUGO gene nomenclature[3]."/>
                </when>

                <!-- Peptides -->
                <when value="peptide">
                    <param format="txt" name="input_peptide" type="data" label="Simple list"
                        help="File with one peptide sequence per line."/>

                    <expand macro="input_fasta" />

                    <param name="ptm_range" type="integer" value="0" label="PTM position range" optional="true"
                        help="Plus minus positions for the same PTM site."/>
                </when>

                <when value="modifiedpeptide">
                    <param format="txt" name="input_modifiedpeptide" type="data" label="Peptide List with PTM types and sites"
                        help="Each line of the file corresponds to a single peptide with post-translational modifications."/>

                    <expand macro="input_fasta" />

                    <param name="ptm_range" type="integer" value="0" label="PTM position range" optional="true"
                        help="Plus minus positions for the same PTM site."/>
                </when>

                <!-- Proteins -->
                <when value="uniprot">
                    <param format="txt" name="input_uniprot" type="data" label="UniProt Accession list"
                        help="File with one UniProt accession [4] per line."/>
                </when>

                <when value="ensembl">
                    <param format="txt" name="input_ensembl" type="data" label="Ensembl identifier list"
                        help="File with one Ensembl identifier [5] per line."/>
                </when>

                <!-- Proteoforms -->
                <when value="proteoforms">
                    <param format="txt" name="input_proteoforms" type="data" label="Proteoforms"
                        help="A proteoform defines a specific state of a protein.
                        It is composed of the protein UniProt accession, isoform and set of post-translational modifications.
                        The input file contains one line for each proteoform. Each PTM is specified using a modification
                        identifier and a site, separated by ':' (colon). For example: '00046:133'.
                        The identifier is a 5 digit id from the PSI-MOD Protein Modification Ontology [6]."/>

                    <param name="proteoform_match_criteria" type="select" label="Proteoform match criteria">
                        <option value="STRICT">STRICT</option>
                        <option value="ONE">ONE</option>
                        <option value="SUPERSET" selected="True">SUPERSET</option>
                        <option value="SUBSET">SUBSET</option>
                    </param>
                </when>

            </conditional>

        </repeat>

        <section name="output_options" expanded="true" title="Output options">

            <!-- Values are the strings "0"/"1"; the command template tests this parameter
                 to decide whether to pass -tlp. -->
            <param name="search_top_level_info" type="select" label="Add search top level info">
                <option value="0" selected="True">False</option>
                <option value="1">True</option>
            </param>

            <!-- Each ticked option adds the matching -gg/-gu/-gp flag; the resulting
                 files are returned through the graphs_files collection. -->
            <param name="output_graphs" type="select" display="checkboxes" multiple="True" label="Connection graphs"
                help="Generates a collection of connection graph files (TSV) as an additional output when executing the pathway search and analysis.
                The graph can use genes, proteins or proteoforms as vertices.">
                <option value="gg">Genes</option>
                <option value="gu">Proteins</option>
                <option value="gp">Proteoforms</option>
            </param>

        </section>

    </inputs>
    <outputs>
        <!-- Result tables picked up from the job working directory. -->
        <data name="search" format="tsv" from_work_dir="search.tsv" label="${tool.name} - search on ${on_string}" />
        <data name="analysis" format="tsv" from_work_dir="analysis.tsv" label="${tool.name} - analysis on ${on_string}" />
        <!-- Graph files moved into "graphs/" by the command; only produced when at
             least one connection graph type was selected. -->
        <collection name="graphs_files" type="list" label="${tool.name} - graphs on ${on_string}" >
            <filter>output_options['output_graphs'] is not None</filter>
            <discover_datasets pattern="__name_and_ext__" directory="graphs" ext="tsv"/>
        </collection>
    </outputs>


    <tests>

        <!-- Gene search: a gene list as the only input; the search table is compared by size. -->
        <test>
            <repeat name="input_types">
                <conditional name="input_type">
                    <param name="input_type_selector" value="gene"/>
                    <param name="input_gene" value="genes.txt" ftype="txt" />
                </conditional>
            </repeat>
            <output name="search" file="genes_search.tsv" ftype="tsv" compare="sim_size" delta="3000" />
        </test>

        <!-- Proteoform input with all three graph types requested; each of the nine
             graph files in the collection is compared by size. -->
        <test>
            <repeat name="input_types">
                <conditional name="input_type">
                    <param name="input_type_selector" value="proteoforms"/>
                    <param name="input_proteoforms" value="proteoforms.txt" ftype="txt" />
                </conditional>
            </repeat>
            <param name="output_graphs" value="gg,gu,gp" />
            <output_collection name="graphs_files" type="list">
                <!-- gene graph -->
                <element name="geneExternalEdges" ftype="tsv" file="proteoforms_graphs/geneExternalEdges.tsv" compare="sim_size" delta="1000" />
                <element name="geneInternalEdges" ftype="tsv" file="proteoforms_graphs/geneInternalEdges.tsv" compare="sim_size" delta="1000"/>
                <element name="geneVertices" ftype="tsv" file="proteoforms_graphs/geneVertices.tsv" compare="sim_size" delta="1000"/>
                <!-- protein graph -->
                <element name="proteinExternalEdges" ftype="tsv" file="proteoforms_graphs/proteinExternalEdges.tsv" compare="sim_size" delta="10000"/>
                <element name="proteinInternalEdges" ftype="tsv" file="proteoforms_graphs/proteinInternalEdges.tsv" compare="sim_size" delta="1000"/>
                <element name="proteinVertices" ftype="tsv" file="proteoforms_graphs/proteinVertices.tsv" compare="sim_size" delta="1000"/>
                <!-- proteoform graph -->
                <element name="proteoformExternalEdges" ftype="tsv" file="proteoforms_graphs/proteoformExternalEdges.tsv" compare="sim_size" delta="1000"/>
                <element name="proteoformInternalEdges" ftype="tsv" file="proteoforms_graphs/proteoformInternalEdges.tsv" compare="sim_size" delta="1000"/>
                <element name="proteoformVertices" ftype="tsv" file="proteoforms_graphs/proteoformVertices.tsv" compare="sim_size" delta="1000"/>
            </output_collection>
        </test>

    </tests>
    <help>

.. class:: infomark

**Introduction**

Biological pathways are an excellent resource to analyze the causes and consequences of certain phenotypes.
Most of the components of the pathways are proteins. When searching for pathways relevant to the analysis
of the proteins in a patient sample, it is very common to lose information due to lack of precision in the search.

This leads to result sets with many extra selected pathways that are not really related to the input sample.

.. class:: infomark

**What it does**

We present a more fine-grained approach to search, using not only the gene names but also the post-translational
modifications of the proteins, such as phosphorylation.

Ultimately, any omics dataset with its mutations and
modifications will be mapped directly to the functional knowledgebases allowing the functional interpretation by
researchers and clinicians.

The reference database used is Reactome, a free, open source, curated and peer reviewed database of biological reactions that contains the quality data needed for this type of fine-grained search. It can be readily queried with omics datasets, and we are improving its features by extending the matching of clinical data to the biological pathways. Not only will the gene names be used, but also mutations or post-translational modifications such as phosphorylation.


.. class:: infomark

**Inputs and outputs**

PathwayMatcher can search for reactions and pathways with various input types, and generates mapping files to the database.

The input can be:

- Genetic variants
- Genes
- Peptides
- Proteins
- Proteoforms

The output of PathwayMatcher is composed of two files: the reaction and pathway mapping, and the statistical analysis of the relevant pathways.

.. class:: infomark

The information included with this tool is a brief summary of the main documentation available at PathwayMatcher_.

Specific information about PathwayMatcher's Input_ and Output_ may also be found there.


.. class:: infomark

**References**

[1] dbSNP_

[2] VCF v4.3:
http://samtools.github.io/hts-specs/VCFv4.3.pdf

[3] genenames.org: the HGNC resources in 2015. Nucleic Acids Res. 2015 Jan;43(Database issue):D1079-85. doi: 10.1093/nar/gku1071. :
https://www.ncbi.nlm.nih.gov/pubmed/25361968

[4] UniProt: the universal protein knowledgebase. Nucleic Acids Res. 45: D158-D169 (2017):
http://dx.doi.org/doi:10.1093/nar/gkw1099

[5] Ensembl:
https://www.ensembl.org/info/genome/stable_ids/index.html

[6] The PSI-MOD community standard for representation of protein modification data. Nature Biotechnology 26, 864 - 866 (2008):
http://www.nature.com/nbt/journal/v26/n8/full/nbt0808-864.html

.. _dbSNP: https://www.ncbi.nlm.nih.gov/projects/SNP/
.. _PathwayMatcher: https://github.com/LuisFranciscoHS/PathwayMatcher
.. _Input: https://github.com/LuisFranciscoHS/PathwayMatcher/wiki/Input
.. _Output: https://github.com/LuisFranciscoHS/PathwayMatcher/wiki/Output

    </help>

</tool>