view msstatstmt.xml @ 1:385b1170b0a2 draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/msstatstmt commit 5ac4f6093606746adb0d40fe4f73871f5934d091"
author galaxyp
date Fri, 05 Feb 2021 18:42:25 +0000
parents c431b9ddb206
children adce265c7051
line wrap: on
line source

<tool id="msstatstmt" name="MSstatsTMT" version="@TOOL_VERSION@+galaxy@GALAXY_VERSION@">
    <description>protein significance analysis in shotgun mass spectrometry-based proteomic experiments with tandem mass tag (TMT) labeling</description>
    <!-- Reusable pieces: the version tokens and the converter switches shared by the MaxQuant and OpenMS input sections. -->
    <macros>
        <!-- Package version; used for the requirement below and as the first part of the tool version. -->
        <token name="@TOOL_VERSION@">1.8.0</token>
        <!-- Wrapper revision; appended to the tool version after "+galaxy". -->
        <token name="@GALAXY_VERSION@">0</token>
        <!-- Boolean switches whose TRUE/FALSE values are pasted verbatim into the generated R script. -->
        <xml name="input_options_shared">
            <param
                name="useUniquePeptide"
                type="boolean"
                truevalue="TRUE"
                falsevalue="FALSE"
                checked="true"
                label="Use unique peptide"
                help="Yes (default) removes peptides that are assigned for more than one protein. We assume to use unique peptide for each protein." />
            <param
                name="rmPSM_withMissing_withinRun"
                type="boolean"
                truevalue="TRUE"
                falsevalue="FALSE"
                checked="false"
                label="Remove PSM with missing value within run"
                help="Yes will remove PSM with any missing value within each run. Default is No." />
            <param
                name="rmPSM_withfewMea_withinRun"
                type="boolean"
                truevalue="TRUE"
                falsevalue="FALSE"
                checked="true"
                label="Remove PSM with few measurements within run"
                help="Only for 'Remove PSM with missing value within run = No'. Yes (default) will remove the features that have 1 or 2 measurements within each run." />
            <param
                name="rmProtein_with1Feature"
                type="boolean"
                truevalue="TRUE"
                falsevalue="FALSE"
                checked="false"
                label="Remove proteins with one feature"
                help="Yes will remove the proteins which have only 1 peptide and charge. Default is No." />
        </xml>
    </macros>
    <!-- Single package dependency, pinned to the tool version token defined in the macros. -->
    <requirements>
        <requirement
            type="package"
            version="@TOOL_VERSION@">bioconductor-msstatstmt</requirement>
    </requirements>
    <!--
        Job command line; a non-zero exit code marks the job as failed.
        Steps, chained with && so the first failing step stops the chain:
          1. copy the rendered R script to the "Rscript" output,
          2. run the rendered R script with Rscript,
          3. concatenate every msstats*.log file into the MSstats log output,
          4. copy msstatstmt.log into the MSstatsTMT log output.
        All four steps run unconditionally, whichever outputs the user selected.
        NOTE(review): step 3 relies on a shell glob; if no msstats*.log file exists
        the cat fails and so does the job. Confirm every configuration writes one.
    -->
    <command detect_errors="exit_code"><![CDATA[
        cat '$msstatstmt_script' > '$out_r_script' &&
        Rscript '$msstatstmt_script'
        && cat msstats*.log > '$out_msstats_log'
        && cat msstatstmt.log > '$out_msstatstmt_log'
    ]]></command>
    <configfiles>
        <!--
            Cheetah template rendered into the R script that the command runs.
            Stages, in order:
              1. load MSstatsTMT;
              2. build "input": read the table as is (MSstatsTMT format) or convert
                 MaxQuant / OpenMS tables with the matching converter function;
              3. summarize to protein level with proteinSummarization, passing NULL
                 for maxQuantileforCensored when the field was left empty;
              4. for every selected output whose name ends in "Plot", call
                 dataProcessPlotsTMT with that name as the plot type;
              5. if group comparison is enabled, optionally read a contrast matrix
                 (first column becomes the row names, columns are reordered to the
                 levels of the Condition column of the summarized data), run
                 groupComparisonTMT and write ComparisonResult.tsv.
            Logical literals are spelled TRUE/FALSE throughout; the shorthands T and F
            are ordinary variables in R and can be reassigned.
            Documentation is kept outside the CDATA section on purpose: a tool test
            asserts the number of lines of the rendered script.
        -->
        <configfile name="msstatstmt_script"><![CDATA[
library(MSstatsTMT, warn.conflicts = FALSE, quietly = TRUE, verbose = FALSE)

#if $input.input_src == 'MSstatsTMT'
    input <- read.table("$input.msstatstmt_input", sep="\t", header=TRUE)

#elif $input.input_src == 'MaxQuant'
    proteinGroups.mq <- read.table("$input.proteinGroups", sep="\t", header=TRUE)
    evidence.mq <- read.table("$input.evidence", sep="\t", header=TRUE)
    annotation.mq <- read.table("$input.annotation", sep="\t", header=TRUE)

    input <- MaxQtoMSstatsTMTFormat(evidence = evidence.mq,
                                    proteinGroups = proteinGroups.mq,
                                    annotation = annotation.mq,
                                    which.proteinid = "$input.proteinID",
                                    rmProt_Only.identified.by.site = $input.input_options.rmProt_Onlyidentifiedbysite,
                                    useUniquePeptide = $input.input_options.useUniquePeptide,
                                    rmPSM_withMissing_withinRun = $input.input_options.rmPSM_withMissing_withinRun,
                                    rmPSM_withfewMea_withinRun = $input.input_options.rmPSM_withfewMea_withinRun,
                                    rmProtein_with1Feature = $input.input_options.rmProtein_with1Feature,
                                    summaryforMultipleRows = $input.input_options.summaryforMultipleRows)

#elif $input.input_src == 'OpenMS'
    input.oms <- read.table("$input.oms_input", sep="\t", header=TRUE)

    input <- OpenMStoMSstatsTMTFormat(input.oms,
                                      useUniquePeptide = $input.input_options.useUniquePeptide,
                                      rmPSM_withMissing_withinRun = $input.input_options.rmPSM_withMissing_withinRun,
                                      rmPSM_withfewMea_withinRun = $input.input_options.rmPSM_withfewMea_withinRun,
                                      rmProtein_with1Feature = $input.input_options.rmProtein_with1Feature,
                                      summaryforMultiplePSMs = $input.input_options.summaryforMultiplePSMs)
#end if

quant <- proteinSummarization(input,
                              method = "$proteinSummarization.method",
                              global_norm = $proteinSummarization.global_norm,
                              reference_norm = $proteinSummarization.reference_norm,
                              remove_norm_channel = $proteinSummarization.remove_norm_channel,
                              remove_empty_channel = $proteinSummarization.remove_empty_channel,
                              MBimpute = $proteinSummarization.MBimpute,
                              #if $proteinSummarization.maxQuantileforCensored == ''
                              maxQuantileforCensored = NULL)
                              #else
                              maxQuantileforCensored = $proteinSummarization.maxQuantileforCensored)
                              #end if

#for $plot_type in $selected_outputs
    #if $plot_type[-4:] == "Plot"
        dataProcessPlotsTMT(input,
                            quant,
                            type = '$plot_type',
                            ylimUp = $out_plots_opt.ylimUp,
                            ylimDown = $out_plots_opt.ylimDown,
                            x.axis.size = $out_plots_opt.x_axis_size,
                            y.axis.size = $out_plots_opt.y_axis_size,
                            text.size = $out_plots_opt.text_size,
                            text.angle = $out_plots_opt.text_angle,
                            legend.size = $out_plots_opt.legend_size,
                            dot.size.profile = $out_plots_opt.dot_size_profile,
                            ncol.guide = $out_plots_opt.ncol_guide,
                            width = $out_plots_opt.width,
                            height = $out_plots_opt.height,
                            #if $out_plots_opt.which_Protein.select != 'list'
                            which.Protein = "$out_plots_opt.which_Protein.select",
                            #else
                            which.Protein = unlist(read.table("$out_plots_opt.which_Protein.protein_list", sep = "\n", header = FALSE), use.names = FALSE),
                            #end if
                            originalPlot = $out_plots_opt.originalPlot,
                            summaryPlot = $out_plots_opt.summaryPlot)
    #end if
#end for

#if $group.group_comparison == 'true'
    #if $group.use_comp_matrix.select == 'true'
        comp_matrix <- read.table("$group.use_comp_matrix.comparison_matrix", sep="\t", header=TRUE, check.names=FALSE)

        comparison <- comp_matrix[,-1]
        row.names(comparison) <- as.character(comp_matrix[,1])
        comparison <- as.matrix(comparison[levels(quant\$Condition)])
    #end if

    comparisons <- groupComparisonTMT(data = quant,
                                      #if $group.use_comp_matrix.select == 'true'
                                      contrast.matrix = comparison,
                                      #end if
                                      moderated = $group.moderated,
                                      adj.method = "$group.adj_method",
                                      remove_norm_channel = $group.remove_norm_channel,
                                      remove_empty_channel = $group.remove_empty_channel)

    write.table(comparisons,
                "ComparisonResult.tsv",
                sep = "\t",
                quote = FALSE,
                row.names = FALSE,
                dec = ".")
#end if
        ]]></configfile>
    </configfiles>
    <inputs>
        <!--
            Input source selector. The option values are compared literally in the R
            script template, so only the human-readable labels and help texts may be
            reworded. Each branch exposes the files the matching converter needs; the
            MaxQuant and OpenMS branches add an "input_options" section built from the
            shared macro plus one converter-specific summary option.
        -->
        <conditional name="input">
            <param name="input_src" type="select" label="Input Source">
                <option value="MSstatsTMT">MSstatsTMT (11 column format)</option>
                <option value="MaxQuant">MaxQuant</option>
                <option value="OpenMS">OpenMS</option>
            </param>
            <when value="MSstatsTMT">
                <param name="msstatstmt_input" type="data" format="tabular" label="MSstatsTMT (11 column format)"/>
            </when>
            <when value="MaxQuant">
                <param name="evidence" type="data" format="tabular" label="evidence.txt - feature-level data"/>
                <param name="proteinGroups" type="data" format="tabular" label="proteinGroups.txt" help="It is needed for matching the protein group ID. If proteinGroups=NULL, use 'Proteins' column in 'evidence.txt'"/>
                <param name="annotation" type="data" format="tabular" label="annotation.txt" help="Data frame which contains column Run, Fraction, TechRepMixture, Channel, Condition, BioReplicate, Mixture." />
                <!-- The option values are passed to the converter as which.proteinid. -->
                <param name="proteinID" type="select" label="Select Protein ID in evidence.txt">
                    <option value="Proteins">Protein column</option>
                    <option value="Leading.razor.protein">Leading razor protein column</option>
                </param>
                <section name="input_options" title="MaxQtoMSstatsTMTFormat Options" expanded="false">
                    <param name="rmProt_Onlyidentifiedbysite" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="false" label="Remove proteins only identified by site" help="Yes will remove proteins with ’+’ in ’Only.identified.by.site’ column from proteinGroups.txt, which was identified only by a modification site. No is the default." />
                    <expand macro="input_options_shared"/>
                    <param name="summaryforMultipleRows" type="select" label="Summary for multiple rows" help="When there are multiple measurements for certain feature in certain run, select the feature with the largest summation or maximal value.">
                        <option value="max">max</option>
                        <option value="sum" selected="true">sum</option>
                    </param>
                </section>
            </when>
            <when value="OpenMS">
                <param name="oms_input" type="data" format="tabular" label="OpenMS input"/>
                <section name="input_options" title="OpenMStoMSstatsTMTFormat Options" expanded="false">
                    <expand macro="input_options_shared"/>
                    <param name="summaryforMultiplePSMs" type="select" label="Summary for multiple PSMs" help="When there are multiple measurements for certain feature in certain run, select the feature with the largest summation or maximal value.">
                        <option value="max">max</option>
                        <option value="sum" selected="true">sum</option>
                    </param>
                </section>
            </when>
        </conditional>
        <!-- Options forwarded one-to-one to the proteinSummarization call in the R script. -->
        <section name="proteinSummarization" title="proteinSummarization Options" expanded="false">
            <param name="method" type="select" multiple="false" label="Select method">
                <option value="msstats" selected="true">msstats</option>
                <option value="MedianPolish">MedianPolish</option>
                <option value="Median">Median</option>
                <option value="LogSum">LogSum</option>
            </param>
            <!-- Normalization switches. -->
            <param name="global_norm" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true"
                   label="Global median normalization"
                   help="Global median normalization on peptide level data (equalizing the medians across all the channels and MS runs). Default is Yes. It will be performed before protein-level summarization."/>
            <param name="reference_norm" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true"
                   label="Reference channel based normalization"
                   help="Reference channel based normalization between MS runs on protein level data. Yes (default) needs at least one reference channel in each MS run, annotated by ’Norm’ in condition column. It will be performed after protein-level summarization. No will not perform this normalization step. If data only has one run, then use No"/>
            <!-- Channel filters. -->
            <param name="remove_norm_channel" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true"
                   label="Remove ’Norm’ channels from protein level data."/>
            <param name="remove_empty_channel" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true"
                   label="Remove ’Empty’ channels from protein level data"/>
            <!-- Missing-value handling. -->
            <param name="MBimpute" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true"
                   label="MBimpute"
                   help="Only for 'method = msstats'. Yes (default) imputes missing values by accelerated failure time (AFT) model. No uses minimum value to impute the missing value for each peptide precursor ion."/>
            <!-- Left empty, the R script passes NULL for this argument. -->
            <param name="maxQuantileforCensored" type="float" optional="true" min="0" max="0.999" value=""
                   label="Maximum quantile for deciding censored missing value"
                   help="We assume missing values are censored. maxQuantileforCensored is maximum quantile for deciding censored missing value, for instance, 0.999. Default is empty"/>
        </section>
        <!--
            Optional group comparison. Choosing "Yes" exposes the groupComparisonTMT
            options and, through the output filter, adds the comparison table output.
        -->
        <conditional name="group">
            <param name="group_comparison" type="select" label="Compare Groups">
                <option value="false">No</option>
                <option value="true">Yes</option>
            </param>
            <when value="false"/>
            <when value="true">
                <!-- A contrast matrix is optional; without one the R call omits contrast.matrix. -->
                <conditional name="use_comp_matrix">
                    <param name="select" type="select" label="Use comparison matrix?">
                        <option value="false">No</option>
                        <option value="true">Yes</option>
                    </param>
                    <when value="false"/>
                    <when value="true">
                        <param name="comparison_matrix" type="data" format="tabular" label="Comparison Matrix"/>
                    </when>
                </conditional>
                <param name="moderated" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="false"
                       label="Moderate t statistic"
                       help="No (default) uses ordinary t statistic"/>
                <!-- The option values are handed to the R call as adj.method. -->
                <param name="adj_method" type="select" label="Adjusted p value method for multiple comparison">
                    <option value="holm">holm</option>
                    <option value="hochberg">hochberg</option>
                    <option value="hommel">hommel</option>
                    <option value="bonferroni">bonferroni</option>
                    <option value="BH" selected="true">BH</option>
                    <option value="BY">BY</option>
                    <option value="fdr">fdr</option>
                    <option value="none">none</option>
                </param>
                <param name="remove_norm_channel" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true"
                       label="Remove ’Norm’ channels from protein level data"/>
                <param name="remove_empty_channel" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true"
                       label="Remove ’Empty’ channels from protein level data"/>
            </when>
        </conditional>
        <!--
            Output picker. The values are matched by the output filters, and every
            value ending in "Plot" is also used as the plot type in the R script.
        -->
        <param
            name="selected_outputs"
            type="select"
            multiple="true"
            optional="false"
            label="Select Outputs">
            <!-- Logs are preselected. -->
            <option value="msstats_log" selected="true">MSstats log</option>
            <option value="msstatstmt_log" selected="true">MSstatsTMT log</option>
            <!-- Opt-in extras. -->
            <option value="r_script" selected="false">MSstats Rscript</option>
            <option value="ProfilePlot" selected="false">Profile Plot</option>
            <option value="QCPlot" selected="false">QC Plot</option>
        </param>
        <!--
            Layout options handed to dataProcessPlotsTMT for every selected plot.
            NOTE(review): ylimUp and ylimDown are booleans here, so only TRUE or FALSE
            can reach the R call; confirm whether a numeric axis limit was intended.
        -->
        <section name="out_plots_opt" title="Plot Output Options" expanded="false">
            <!-- Axis limits. -->
            <param name="ylimUp" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="false"
                   label="Upper limit for y-axis in the log scale"
                   help="No (Default) for Profile Plot and QC Plot uses the upper limit as rounded off maximum of log2(intensities) after normalization + 3."/>
            <param name="ylimDown" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="false"
                   label="Lower limit for y-axis in the log scale"
                   help="No (Default) for Profile Plot and QCPlot uses 0."/>
            <!-- Text, legend and dot sizes. -->
            <param name="x_axis_size" type="integer" value="10" min="1" label="Size of x-axis labeling"/>
            <param name="y_axis_size" type="integer" value="10" min="1" label="Size of y-axis labeling"/>
            <param name="text_size" type="integer" value="4" min="1" label="Size of labels represented each condition at the top"/>
            <param name="text_angle" type="integer" value="90" min="0" max="360" label="Angle of labels represented each condition at the top"/>
            <param name="legend_size" type="integer" value="7" min="1" label="Size of legend above Profile plot"/>
            <param name="dot_size_profile" type="integer" value="2" min="1" label="Size of dots in Profile plot"/>
            <param name="ncol_guide" type="integer" value="5" min="1" label="Number of columns for legends at the top of plot"/>
            <!-- Page size of the PDF. -->
            <param name="width" type="integer" value="10" min="1" label="Width of the saved pdf file"/>
            <param name="height" type="integer" value="10" min="1" label="Height of the saved pdf file"/>
            <!-- Either a keyword passed as a string, or a one-ID-per-line file read by the R script. -->
            <conditional name="which_Protein">
                <param name="select" type="select" label="Select protein IDs to draw plots">
                    <option value="all" selected="true">generate all plots for each protein</option>
                    <option value="allonly">Option for QC plot: "allonly" will generate one QC plot with all proteins</option>
                    <option value="list">Protein IDs as tabular input</option>
                </param>
                <when value="all"/>
                <when value="allonly"/>
                <when value="list">
                    <param name="protein_list" type="data" format="tabular" label="List of proteins"/>
                </when>
            </conditional>
            <!-- Which profile plot variants to draw. -->
            <param name="originalPlot" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true"
                   label="Draw original profile plots without normalization"/>
            <param name="summaryPlot" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true"
                   label="Draw profile plots with protein summarization for each channel and MS run"/>
        </section>
    </inputs>
    <!--
        Every output is optional. The first five are kept only when their key is
        among the selected outputs; the comparison table is kept only when group
        comparison is enabled.
    -->
    <outputs>
        <!-- Written explicitly by the command line. -->
        <data
            name="out_msstats_log"
            format="txt"
            label="${tool.name} on ${on_string}: MSstats log">
            <filter>'msstats_log' in selected_outputs</filter>
        </data>
        <data
            name="out_msstatstmt_log"
            format="txt"
            label="${tool.name} on ${on_string}: MSstatsTMT log">
            <filter>'msstatstmt_log' in selected_outputs</filter>
        </data>
        <data
            name="out_r_script"
            format="txt"
            label="${tool.name} on ${on_string}: Rscript">
            <filter>'r_script' in selected_outputs</filter>
        </data>
        <!-- Collected from files in the job working directory. -->
        <data
            name="out_profile_plot"
            format="pdf"
            from_work_dir="ProfilePlot.pdf"
            label="${tool.name} on ${on_string}: Profile Plot">
            <filter>'ProfilePlot' in selected_outputs</filter>
        </data>
        <data
            name="out_qc_plot"
            format="pdf"
            from_work_dir="QCPlot.pdf"
            label="${tool.name} on ${on_string}: QC Plot">
            <filter>'QCPlot' in selected_outputs</filter>
        </data>
        <data
            name="out_group_comp"
            format="tsv"
            from_work_dir="ComparisonResult.tsv"
            label="${tool.name} on ${on_string}: Group Comparison">
            <filter>group['group_comparison'] == 'true'</filter>
        </data>
    </outputs>
    <tests>
        <!-- MSstatsTMT-format input with all five selectable outputs requested and no group comparison. -->
        <test expect_num_outputs="5">
            <conditional name="input">
                <param name="input_src" value="MSstatsTMT"/>
                <param name="msstatstmt_input" value="input.msstatstmt.txt" ftype="tabular"/>
            </conditional>
            <param name="selected_outputs" value="msstats_log,msstatstmt_log,r_script,ProfilePlot,QCPlot"/>
            <!-- Both logs must contain a known message. -->
            <output name="out_msstats_log">
                <assert_contents>
                    <has_text text="1 level of Isotope type labeling in this experiment"/>
                </assert_contents>
            </output>
            <output name="out_msstatstmt_log">
                <assert_contents>
                    <has_text text="MSstatsTMT - proteinSummarization function"/>
                </assert_contents>
            </output>
            <!-- Pins the length of the rendered R script; editing the script template can change it. -->
            <output name="out_r_script">
                <assert_contents>
                    <has_n_lines n="51"/>
                </assert_contents>
            </output>
            <!-- The PDFs are compared by size only. -->
            <output name="out_profile_plot" compare="sim_size" file="ProfilePlot.pdf"/>
            <output name="out_qc_plot" compare="sim_size" file="QCPlot.pdf"/>
        </test>
        <!--
            MaxQuant input with a user-supplied comparison matrix and a profile plot
            restricted to a list of protein IDs. Expected outputs: the comparison
            table and the profile plot.
        -->
        <test expect_num_outputs="2">
            <conditional name="input">
                <param name="input_src" value="MaxQuant"/>
                <param name="evidence" ftype="tabular" value="evidence.txt"/>
                <param name="annotation" ftype="tabular" value="annotation.txt"/>
                <param name="proteinGroups" ftype="tabular" value="proteinGroups.txt"/>
            </conditional>
            <conditional name="group">
                <param name="group_comparison" value="true"/>
                <conditional name="use_comp_matrix">
                    <param name="select" value="true"/>
                    <param name="comparison_matrix" ftype="tabular" value="comparison_matrix.txt"/>
                </conditional>
            </conditional>
            <param name="selected_outputs" value="ProfilePlot"/>
            <!-- The protein selector lives inside the plot options section, so it is addressed through it. -->
            <section name="out_plots_opt">
                <conditional name="which_Protein">
                    <param name="select" value="list"/>
                    <param name="protein_list" ftype="tabular" value="proteinIDs.txt"/>
                </conditional>
            </section>
            <output name="out_group_comp">
                <assert_contents>
                    <has_n_lines n="21" />
                    <has_n_columns n="8" />
                    <has_text text="A0AVT1" />
                    <has_text text="O43324" />
                </assert_contents>
            </output>
            <output name="out_profile_plot" file="ProfilePlot_list.pdf" compare="sim_size"/>
        </test>
        <!-- OpenMS input with group comparison and no comparison matrix; outputs are the MSstats log and the comparison table. -->
        <test expect_num_outputs="2">
            <conditional name="input">
                <param name="input_src" value="OpenMS"/>
                <param name="oms_input" value="input.oms.txt" ftype="tabular"/>
            </conditional>
            <conditional name="group">
                <param name="group_comparison" value="true"/>
            </conditional>
            <param name="selected_outputs" value="msstats_log"/>
            <!-- Checks the table shape plus one comparison label and one protein ID. -->
            <output name="out_group_comp">
                <assert_contents>
                    <has_n_lines n="51"/>
                    <has_n_columns n="8"/>
                    <has_text text="Long_LF-Short_HF"/>
                    <has_text text="sp|O35226|PSMD4_MOUSE"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
MSstatsTMT is an R-based package for detecting differentially abundant proteins in shotgun mass spectrometry-based proteomic experiments with tandem mass tag (TMT) labeling. It is applicable to isobaric labeling quantitative proteomics, including iTRAQ and TMT data. MSstatsTMT provides functionalities for two types of analysis: 1) Protein summarization based on peptide quantification data and visualization; 2) Model-based group comparison to detect significant changes in abundance.

**Notes**

- MSstatsTMT 11 column format: For TMT datasets an additional 'Channel' column is required.

    ::

        #>            ProteinName                               PeptideSequence
        #> 1 sp|Q60854|SPB6_MOUSE .(TMT6plex)AFVEVNEEGTEAAAATAGMM(Oxidation)TVR
        #> 2 sp|Q60854|SPB6_MOUSE .(TMT6plex)AFVEVNEEGTEAAAATAGMM(Oxidation)TVR
        #> 3 sp|Q60854|SPB6_MOUSE .(TMT6plex)AFVEVNEEGTEAAAATAGMM(Oxidation)TVR
        #> 4 sp|Q60854|SPB6_MOUSE .(TMT6plex)AFVEVNEEGTEAAAATAGMM(Oxidation)TVR
        #> 5 sp|Q60854|SPB6_MOUSE .(TMT6plex)AFVEVNEEGTEAAAATAGMM(Oxidation)TVR
        #> 6 sp|Q60854|SPB6_MOUSE .(TMT6plex)AFVEVNEEGTEAAAATAGMM(Oxidation)TVR
        #>   PrecursorCharge                                             PSM Mixture
        #> 1               3 .(TMT6plex)AFVEVNEEGTEAAAATAGMM(Oxidation)TVR_3       3
        #> 2               3 .(TMT6plex)AFVEVNEEGTEAAAATAGMM(Oxidation)TVR_3       3
        #> 3               3 .(TMT6plex)AFVEVNEEGTEAAAATAGMM(Oxidation)TVR_3       3
        #> 4               3 .(TMT6plex)AFVEVNEEGTEAAAATAGMM(Oxidation)TVR_3       3
        #> 5               3 .(TMT6plex)AFVEVNEEGTEAAAATAGMM(Oxidation)TVR_3       3
        #> 6               3 .(TMT6plex)AFVEVNEEGTEAAAATAGMM(Oxidation)TVR_3       3
        #>   TechRepMixture   Run Channel BioReplicate Condition Intensity
        #> 1            3_3 3_3_3       1           21   Long_HF        NA
        #> 2            3_3 3_3_3       2           22      Norm  1068.580
        #> 3            3_3 3_3_3       3           23    Long_M  1508.330
        #> 4            3_3 3_3_3       4           24   Long_HF        NA
        #> 5            3_3 3_3_3       5           25   Long_LF  1580.951
        #> 6            3_3 3_3_3       6           26   Long_HF  1820.072

    For more information please visit the `MSstatsConvert documentation <https://bioconductor.org/packages/devel/bioc/vignettes/MSstatsConvert/inst/doc/msstats_data_format.html>`_

- Comparison matrix as tabular file

    - 1st column: name of comparison
    - additionally one column for each condition that is present in the tabular file. Use 1 and -1 to indicate the conditions to compare and 0 for conditions that are not compared. Multiple groups can be combined by using 0.5.
    - first row contains the names of the groups, they must exactly match the condition name used in the annotation file
    - each additional row represents one comparison
    - Example for a two group comparison

       ::

               names     groupA  groupB
          groupA-groupB    1      -1


    - Example for an experiment with 5 groups and 4 different comparisons

       ::

          names    G1   G2   G3   G4   G5
          G2-G1    -1    1    0    0    0
          G4-G5     0    0    0    1   -1
          G3-G5     0    0   -1    0    1
        G1+G2-G5    0.5  0.5  0    0   -1

For additional help please visit the `MSstatsTMT documentation <https://msstats.org/msstatstmt/>`_
]]>
    </help>
    <!-- Publication to cite when using this tool, referenced by DOI. -->
    <citations>
        <citation type="doi">10.1074/mcp.ra120.002105</citation>
    </citations>
</tool>