Mercurial > repos > galaxyp > meta_proteome_analyzer

<tool id="meta_proteome_analyzer" name="MetaProteomeAnalyzer" version="@TOOL_VERSION@+galaxy@WRAPPER_VERSION@">
    <!--
        Galaxy wrapper around the mpa-portable command line interface.
        Inputs: one FASTA database and one or more MGF peak lists.
        Outputs: four tabular reports (proteins, peptides, PSMs, spectrum ids) plus two
        metaprotein reports that are only created when metaprotein generation is enabled.
    -->
    <description>functional and taxonomic characterization of proteins</description>
    <macros>
        <token name="@TOOL_VERSION@">2.0.0</token>
        <token name="@WRAPPER_VERSION@">1</token>
        <!--
            Test helper: asserts that output "name" contains the text "has_text".
            Any assertion elements placed inside the <expand> are appended to the same
            <assert_contents> block, so an output carries exactly one assertion list.
        -->
        <xml name="test_output" token_name="" token_has_text="">
            <output name="@NAME@">
                <assert_contents>
                    <has_text text="@HAS_TEXT@" />
                    <yield/>
                </assert_contents>
            </output>
        </xml>
    </macros>
    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">mpa-portable</requirement>
    </requirements>
    <!-- Report JVM memory exhaustion (on stdout or stderr) as an out-of-memory failure. -->
    <stdio>
        <regex match="Could not allocate metaspace" source="both" level="fatal_oom" description="Insufficient memory" />
        <regex match="OutOfMemoryError" source="both" level="fatal_oom" description="Insufficient memory" />
    </stdio>
    <command detect_errors="exit_code">
<![CDATA[
        #import re
        mkdir -p output_dir &&
        ## copy mpa conf dir to working dir
        jar_dir=`mpa-portable -get_jar_dir` &&

        cp -R "\$jar_dir/conf" . &&

        ## echo the search engines to run
        #set $search_engines = str($engines).split(',')
        echo '$engines' &&
        ## Dataset names are user controlled: anything outside word characters, dash, dot
        ## and space is replaced by an underscore before the name reaches the shell.
        #set $db_label = re.sub('[^\w\-\. ]', '_', str($input_database.display_name))
        echo 'DB: ${db_label} sequences: ${input_database.metadata.sequences}' &&

        ## Link every peak list into the working directory under a name ending in .mgf;
        ## the working directory itself is handed to MPA as the spectrum folder below.
        #for $mgf in $peak_lists:
            #set $mgf_label = re.sub('[^\w\-\. ]', '_', str($mgf.display_name))
            #set $input_name = re.sub('[^\w\-\. ]', '_', str($mgf.display_name).split('/')[-1].replace(".mgf", "")) + ".mgf"
            ln -s -f '${mgf}' '${input_name}' &&
            #set $encoded_id = $__app__.security.encode_id($mgf.id)
            echo 'Spectrums:${mgf_label}(API:${encoded_id}) ' &&
        #end for
        cp '${input_database}' input_database.fasta &&

        ######################
        ## MPA ##
        ######################
        mpa-portable de.mpa.cli.CmdLineInterface --exec_dir=exec_dir -Xmx2048m
            -spectrum_files "\$(pwd)"
            -database input_database.fasta
            -missed_cleav $missed_cleavages
            -prec_tol ${precursor_options.prec_tol}${precursor_options.prec_tol_units}
            -frag_tol ${precursor_options.frag_tol}Da
            -xtandem #if 'X!Tandem' in $search_engines then 1 else 0#
            -comet #if 'Comet' in $search_engines then 1 else 0#
            -msgf #if 'MSGF' in $search_engines then 1 else 0#
            -generate_metaproteins $generate_metaproteins
            -iterative_search $iterative_search
            -fragment_method $fragment_method
            -peptide_index $peptide_index
            -fdr_threshold $fdr_threshold
            -semi_tryptic $semi_tryptic
            -output_folder output_dir
            -threads "\${GALAXY_SLOTS:-12}" &&
        ## Move the reports to the fixed names the outputs collect via from_work_dir.
        #if $generate_metaproteins == "1"
            mv ./output_dir/*_metaproteins.csv metaproteins.csv &&
            mv ./output_dir/*_metaprotein_taxa.csv metaprotein_taxa.csv &&
        #end if
        mv ./output_dir/*_peptides.csv peptides.csv &&
        mv ./output_dir/*_proteins.csv proteins.csv &&
        mv ./output_dir/*_psms.csv psms.csv &&
        mv ./output_dir/*_spectrum_ids.csv spectrum_ids.csv
]]>
    </command>
    <inputs>
        <param name="input_database" type="data" format="fasta" label="Protein Database"
            help="Select FASTA database from history"/>
        <param name="peak_lists" type="data" format="mgf" multiple="true" label="Input Peak Lists (mgf)"
            help="Select appropriate MGF dataset(s) from history" />
        <param name="missed_cleavages" type="integer" value="2" min="0" label="Maximum Missed Cleavages"
            help="Allow peptides to contain up to this many missed enzyme cleavage sites."/>
        <section name="precursor_options" expanded="false" title="Precursor Options">
            <!-- The unit is appended directly to the tolerance value on the command line. -->
            <param name="prec_tol_units" type="select" label="Precursor Ion Tolerance Units"
                help="Select based on instrument used, as different machines provide different quality of spectra. ppm is a standard for most precursor ions">
                <option value="ppm">Parts per million (ppm)</option>
                <option value="Da">Daltons</option>
            </param>
            <param name="prec_tol" type="float" value="10" min="0" label="Precursor Ion Tolerance"
                help="Provide error value for precursor ion, based on instrument used. 10 ppm recommended for Orbitrap instrument"/>
            <param name="frag_tol" type="float" value="0.5" min="0" label="Fragment Tolerance (Daltons)"
                help="Provide error value for fragment ions, based on instrument used"/>
        </section>
        <!-- Search Engine Selection: at least one engine has to stay selected. -->
        <!-- NOTE(review): the help text mentions Tide, which is not among the options below; confirm the wording. -->
        <param name="engines" type="select" display="checkboxes" multiple="true" optional="false" label="DB-Search Engines">
            <help>Comet and Tide shouldn't both be selected since they use a similar algorithm.</help>
            <option value="X!Tandem" selected="true">X!Tandem</option>
            <option value="MSGF">MS-GF+</option>
            <option value="Comet">Comet</option>
        </param>
        <!-- Boolean switches are passed to MPA as 1 (on) or 0 (off). -->
        <param argument="-generate_metaproteins" type="boolean" truevalue="1" falsevalue="0" checked="true" label="Meta-protein generation" help="aka. protein grouping"/>
        <param argument="-iterative_search" type="boolean" truevalue="1" falsevalue="0" checked="true" label="Iterative searching" help="aka. two-step searching"/>
        <param argument="-semi_tryptic" type="boolean" truevalue="1" falsevalue="0" checked="false" label="Semi-tryptic cleavage" help=""/>
        <param argument="-fragment_method" type="select" label="Fragmentation method" help="for the MS instrument">
            <option value="1" selected="true">CID</option>
            <option value="2">HCD</option>
            <option value="3">ETD</option>
        </param>
        <param argument="-peptide_index" type="boolean" truevalue="1" falsevalue="0" checked="true" label="Peptide indexing (of FASTA database)" help=""/>
        <param argument="-fdr_threshold" type="float" value="0.05" min="0" max="1" label="FDR threshold for filtering" help=""/>
    </inputs>
    <!-- Every output declares one comment line and its column names as dataset metadata. -->
    <outputs>
        <data format="tabular" name="output_proteins" from_work_dir="proteins.csv" label="${tool.name} on ${on_string}: proteins">
            <actions>
                <action name="comment_lines" type="metadata" default="1" />
                <action name="column_names" type="metadata" default="Protein_No,Protein_Accession,Protein_Description,Protein_Taxonomy,Sequence_Coverage,Peptide_Count,NSAF,emPAI,Spectral_Count,Isoelectric_Point,Molecular_Weight,Protein_Sequence,Peptides" />
            </actions>
        </data>
        <data format="tabular" name="output_peptides" from_work_dir="peptides.csv" label="${tool.name} on ${on_string}: peptides">
            <actions>
                <action name="comment_lines" type="metadata" default="1" />
                <action name="column_names" type="metadata" default="Peptide_Num,Protein_Accessions,Peptide_Sequence,Protein_Count,Spectral_Count,Taxonomic_Group,Taxonomic_Rank,NCBI_Taxonomy_ID" />
            </actions>
        </data>
        <data format="tabular" name="output_PSMs" from_work_dir="psms.csv" label="${tool.name} on ${on_string}: PSMs">
            <actions>
                <action name="comment_lines" type="metadata" default="1" />
                <action name="column_names" type="metadata" default="PSM_Num,Protein_Accessions,Peptide_Sequence,Spectrum_Title,Charge,Search_Engine,q-value,Score" />
            </actions>
        </data>
        <data format="tabular" name="output_spectrum_ids" from_work_dir="spectrum_ids.csv" label="${tool.name} on ${on_string}: spectrum_ids">
            <actions>
                <action name="comment_lines" type="metadata" default="1" />
                <action name="column_names" type="metadata" default="Spectrum_Number,Spectrum_ID,Spectrum_Title,Peptides,Protein_Accessions" />
            </actions>
        </data>
        <!-- The two metaprotein reports exist only when metaprotein generation is switched on. -->
        <data format="tabular" name="output_metaproteins" from_work_dir="metaproteins.csv" label="${tool.name} on ${on_string}: metaproteins">
            <filter>generate_metaproteins</filter>
            <actions>
                <action name="comment_lines" type="metadata" default="1" />
                <action name="column_names" type="metadata" default="Meta-Protein_Num,Meta-Protein_Accession,Meta-Protein_Description,Meta-Protein_Taxonomy,Meta-Protein_UniRef100,Meta-Protein_UniRef90,Meta-Protein_UniRef50,Meta-Protein_KO,Meta-Protein_EC,Peptide_Count,Spectral_Count,Proteins,Peptides" />
            </actions>
        </data>
        <data format="tabular" name="output_metaprotein_taxa" from_work_dir="metaprotein_taxa.csv" label="${tool.name} on ${on_string}: metaprotein_taxa">
            <filter>generate_metaproteins</filter>
            <actions>
                <action name="comment_lines" type="metadata" default="1" />
                <action name="column_names" type="metadata" default="Unclassified,Superkingdom,Kingdom,Phylum,Class,Order,Family,Genus,Species,Subspecies,Num_Peptides,Spectral_Count" />
            </actions>
        </data>
    </outputs>
    <!-- NOTE(review): expected strings below are taken from the test definitions; confirm them against the test-data files. -->
    <tests>
        <!-- All three engines, metaprotein generation left at its default (on): six outputs. -->
        <test expect_num_outputs="6">
            <param name="peak_lists" value="Test416Ebendorf.mgf" ftype="mgf"/>
            <param name="input_database" value="searchdb.fa" ftype="fasta"/>
            <param name="missed_cleavages" value="2"/>
            <param name="prec_tol_units" value="ppm"/>
            <param name="prec_tol" value="10"/>
            <param name="frag_tol" value="0.5"/>
            <param name="engines" value="X!Tandem,MSGF,Comet"/>
            <expand macro="test_output" name="output_proteins" has_text="B8GJQ7"/>
            <expand macro="test_output" name="output_peptides" has_text="B8GJQ7"/>
            <expand macro="test_output" name="output_PSMs" has_text="B8GJQ7">
                <has_text text="A2SPK1" />
                <has_text text="X!Tandem" />
            </expand>
            <expand macro="test_output" name="output_spectrum_ids" has_text="B8GJQ7"/>
            <expand macro="test_output" name="output_metaproteins" has_text="B8GJQ7"/>
            <expand macro="test_output" name="output_metaprotein_taxa" has_text="Unknown Superkingdom"/>
        </test>
        <!-- X!Tandem deselected and metaprotein generation off: only the four base outputs exist. -->
        <test expect_num_outputs="4">
            <param name="peak_lists" value="Test416Ebendorf.mgf" ftype="mgf"/>
            <param name="input_database" value="searchdb.fa" ftype="fasta"/>
            <param name="missed_cleavages" value="2"/>
            <param name="prec_tol_units" value="ppm"/>
            <param name="prec_tol" value="10"/>
            <param name="frag_tol" value="0.5"/>
            <param name="engines" value="MSGF,Comet"/>
            <param name="generate_metaproteins" value="0"/>
            <expand macro="test_output" name="output_proteins" has_text="B8GJQ7"/>
            <expand macro="test_output" name="output_peptides" has_text="B8GJQ7"/>
            <expand macro="test_output" name="output_PSMs" has_text="B8GJQ7">
                <has_text text="A2SPK1" />
                <not_has_text text="X!Tandem" />
            </expand>
            <expand macro="test_output" name="output_spectrum_ids" has_text="B8GJQ7"/>
        </test>
    </tests>
    <help><![CDATA[
What it does
============

MetaProteomeAnalyzer (MPA) performs identification of proteins and in-depth analysis of metaproteomics (and also proteomics) data. The MPA software currently supports the database search engines Comet, MS-GF+ and X!Tandem taking MGF spectrum files as input data. User-provided FASTA databases (preferably downloaded from UniProtKB) are formatted automatically.

https://github.com/compomics/meta-proteome-analyzer

----

Outputs
=======

MPA generates up to 6 tabular outputs (the two metaprotein outputs are only produced when meta-protein generation is enabled):

* psms
* peptides
* proteins
* spectrum_ids
* metaproteins
* metaprotein_taxa
    ]]></help>
    <citations>
        <citation type="doi">10.1021/pr501246w</citation>
    </citations>
</tool>
author	galaxyp
date	Sun, 03 Jan 2021 20:30:02 +0000
parents	c587a54c70ad
children