Mercurial > repos > galaxyp > calisp
comparison calisp.xml @ 0:d158a7909193 draft default tip
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/calisp commit 060699366b6dd19ad6c3ef3f332f63cc55d75dce
| author | galaxyp |
|---|---|
| date | Thu, 01 Jun 2023 08:33:58 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:d158a7909193 |
|---|---|
| 1 <tool id="calisp" name="calisp" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="21.05"> | |
| 2 <description>Estimate isotopic composition of peptides from proteomics mass spectrometry data</description> | |
| 3 <macros> | |
| 4 <token name="@TOOL_VERSION@">3.0.10</token> <!-- used in the tool version attribute and as the version of the calisp package requirement --> | |
| 5 <token name="@VERSION_SUFFIX@">0</token> <!-- appended after "+galaxy" in the tool version attribute --> | |
| 6 <token name="@CALISP_REPO@">https://raw.githubusercontent.com/kinestetika/Calisp/208d495674e2b52fe56cf23457c833d1c2527242</token> <!-- NOTE(review): not referenced anywhere in the visible tool definition; confirm before removing --> | |
| 7 <xml name="input_macro" tokens="multiple"> | |
| 8 <!-- According to the readme, mzid input is not yet implemented; this macro is currently empty --> | |
| 9 </xml> | |
| 10 </macros> | |
| 11 <requirements> | |
| 12 <requirement type="package" version="@TOOL_VERSION@">calisp</requirement> | |
| 13 </requirements> | |
| 14 <command detect_errors="aggressive"><![CDATA[ | |
| 15 #import re | |
| 16 ## Link names are the element identifiers with every character outside [\w-.,:] replaced by "_". | |
| 17 #set spectrum_link = re.sub('[^\w\-\.,:]', '_', str($spectrum_file.element_identifier)) | |
| 18 #set psm_link = re.sub('[^\w\-\.,:]', '_', str($peptide_file.element_identifier)) | |
| 19 ## calisp is pointed at the two staging directories below rather than at the datasets themselves. | |
| 20 mkdir -p spectra psms && | |
| 21 ln -s '$spectrum_file' 'spectra/$spectrum_link' && | |
| 22 ln -s '$peptide_file' 'psms/$psm_link' && | |
| 23 | |
| 24 ## Run calisp, then hand its output directory to the bundled feather2tsv.py script. | |
| 25 calisp | |
| 26 --spectrum_file spectra/ | |
| 27 --peptide_file psms/ | |
| 28 --output_file calisp-output/ | |
| 29 --mass_accuracy $mass_accuracy | |
| 30 --bin_delimiter '$bin_delimiter' | |
| 31 --threads "\${GALAXY_SLOTS:-1}" | |
| 32 --isotope $isotope | |
| 33 $compute_clumps && | |
| 34 '$__tool_directory__/feather2tsv.py' --calisp_output calisp-output/ | |
| 35 ]]></command> | |
| 36 <inputs> <!-- each param's argument attribute names the calisp command-line option it is passed to in the command section --> | |
| 37 <param argument="--spectrum_file" type="data" multiple="false" format="mzml" label="Spectrum file"/> | |
| 38 <param argument="--peptide_file" type="data" multiple="false" format="tabular" label="Peptide file" help="Psm file" /> | |
| 39 <param argument="--mass_accuracy" type="float" value="10" label="Mass accuracy" help="The maximum mass difference between theoretical mass and experimental mass of a peptide" /> | |
| 40 <param argument="--bin_delimiter" type="text" value="_" label="Bin delimiter" help="For metagenomic data, the delimiter that separates the bin ID from the protein ID (default: &quot;_&quot;). Use &quot;-&quot; to ignore bin IDs entirely."> | |
| 41 <sanitizer invalid_char=""> <!-- whitelist: ASCII letters, digits, "_", "-" and ":"; any other character is replaced by the empty invalid_char, i.e. removed --> | |
| 42 <valid initial="string.ascii_letters,string.digits"> | |
| 43 <add value="_" /> | |
| 44 <add value="-" /> | |
| 45 <add value=":" /> | |
| 46 </valid> | |
| 47 </sanitizer> | |
| 48 </param> | |
| 49 <param argument="--isotope" type="select" label="Target isotope"> | |
| 50 <option value="13C" selected="true">13C</option> | |
| 51 <option value="14C">14C</option> | |
| 52 <option value="15N">15N</option> | |
| 53 <option value="17O">17O</option> | |
| 54 <option value="18O">18O</option> | |
| 55 <option value="2H">2H</option> | |
| 56 <option value="3H">3H</option> | |
| 57 <option value="33S">33S</option> | |
| 58 <option value="34S">34S</option> | |
| 59 <option value="36S">36S</option> | |
| 60 </param> | |
| 61 <param argument="--compute_clumps" type="boolean" truevalue="--compute_clumps" falsevalue="" checked="false" label="Compute clumps" help="To compute clumpiness of carbon assimilation. Only use when samples are labeled to saturation. Estimation of clumpiness takes much additional time." /> <!-- expands to its truevalue flag when checked and to an empty string otherwise --> | |
| 62 </inputs> | |
| 63 <outputs> | |
| 64 <collection name="output" type="list"> <!-- one list element per .tsv file found in calisp-output; the part of the file name before ".tsv" becomes the element designation --> | |
| 65 <discover_datasets pattern="(?P&lt;designation&gt;.*)\.tsv" format="tabular" directory="calisp-output"/> | |
| 66 </collection> | |
| 67 </outputs> | |
| 68 <tests> | |
| 69 <!-- TODO test data too large, available from here: https://github.com/kinestetika/Calisp/tree/master/test | |
| 70 if possible include via location in the future | |
| 71 <test expect_num_outputs="1"> | |
| 72 <param name="spectrum_file" value="calisp_test_data.mzML" ftype="mzml"/> | |
| 73 <param name="peptide_file" value="calisp_test_data_TargetPeptideSpectrumMatch.txt" ftype="tabular"/> | |
| 74 <output_collection name="output" count="1"> | |
| 75 <element name="calisp_test_data"> | |
| 76 <assert_contents> | |
| 77 <has_text text="experiment"/> | |
| 78 <has_text text="MKH_260min_1800ng"/> | |
| 79 <has_text text="HOMO"/> | |
| 80 <has_text text="P13645"/> | |
| 81 <has_text text="NHEEEMKDLR"/> | |
| 82 <has_text text="Oxidation"/> | |
| 83 <has_n_columns n="85"/> | |
| 84 <has_n_lines n="24"/> | |
| 85 </assert_contents> | |
| 86 </element> | |
| 87 </output_collection> | |
| 88 </test> | |
| 89 --> | |
| 90 </tests> | |
| 91 <help><![CDATA[ | |
| 92 Calisp (Calgary approach to isotopes in proteomics) is a program that estimates | |
| 93 isotopic composition (e.g. 13C/12C, delta13C, 15N/14N etc) of peptides from | |
| 94 proteomics mass spectrometry data. Input data consist of mzML files and files | |
| 95 with peptide spectrum matches. | |
| 96 | |
| 97 Calisp was originally developed in Java. This Galaxy tool uses the python | |
| 98 reimplementation https://github.com/kinestetika/Calisp. | |
| 99 Note that, in contrast to the Java version, the Python reimplementation does | |
| 100 not use ``mcl``. | |
| 101 Compared to Java versions of calisp, the workflow has been simplified. | |
| 102 Calisp does not filter out any isotopic patterns, or add up isotopic | |
| 103 patterns to reduce noise - like the Java version does. It simply estimates the | |
| 104 ratio for the target isotopes (e.g. 13C/12C) for every isotopic pattern it can | |
| 105 subsample. It estimates this ratio based on neutron abundance and using fast | |
| 106 fourier transforms. The former applies to stable isotope probing experiments. | |
| 107 The latter applies to natural abundances, or to isotope probing experiments with | |
| 108 very little added label (e.g. using substrates with <1% additional 13C). The | |
| 109 motivation for omitting filtering is that keeping all subsampled isotopic | |
| 110 patterns, including bad ones, will enable training of machine learning | |
| 111 classifiers. Also, because it was shown that the median provides better | |
| 112 estimates for species in microbial communities than the mean, adding up isotopic | |
| 113 patterns to improve precision has lost its purpose. There is more power (and | |
| 114 sensitivity) in numbers. | |
| 115 | |
| 116 Because no data are filtered out and no isotopic patterns get added up, | |
| 117 calisp analyzes at least ten times as many isotopic patterns compared to the | |
| 118 Java version. That means calisp.py is about ten times slower, it takes about | |
| 119 5-10 min per .mzML file on a Desktop computer. For natural | |
| 120 abundance data, it works well to only use those spectra that have a FFT fitting | |
| 121 error ("error_fft") of less than 0.001. Note that this threshold is less | |
| 122 stringent than the one used by the Java program. | |
| 123 | |
| 124 Input | |
| 125 ===== | |
| 126 | |
| 127 Calisp needs two inputs: a spectra file in mzML format and a tabular peptide file (PSM). | |
| 128 The PSM file contains a column "Spectrum File" that links the peptides to the | |
| 129 original spectra files. The mzML files are identified by the run id | |
| 130 information stored in the mzML files or the file name. | |
| 131 In order to make the association via the file name work in Galaxy one can either | |
| 132 | |
| 133 - use collections where the element identifiers are equal to the data in the column | |
| 134 - make sure that dataset names are equal to the data in this column | |
| 135 | |
| 136 Output table | |
| 137 ============ | |
| 138 | |
| 139 Each row contains one isotopic pattern, defined by the following columns: | |
| 140 | |
| 141 ========================================== =================== | |
| 142 Header name Content | |
| 143 ========================================== =================== | |
| 144 experiment filename of the peptide spectrum match (psm) file | |
| 145 ms_run filename of the .mzml file | |
| 146 bins bin/mag ids, separated by commas. Calisp expects the protein ids in the psm file to consist of two parts, separated by a delimiter (_ by default). The first part is the bin/mag id, the second part the protein id | |
| 147 proteins the ids of the proteins associated with the pattern (without the bin id) | |
| 148 peptide the amino acid sequence of the peptide | |
| 149 peptide_mass the mass of the peptide | |
| 150 C # of carbon atoms in the peptide | |
| 151 N # of nitrogen atoms in the peptide | |
| 152 O # of oxygen atoms in the peptide | |
| 153 H # of hydrogen atoms in the peptide | |
| 154 S # of sulfur atoms in the peptide | |
| 155 psm_id psm id | |
| 156 psm_mz psm m over z | |
| 157 psm_charge psm charge | |
| 158 psm_neutrons number of neutrons inferred from custom 'neutron' modifications | |
| 159 psm_rank rank of the psm | |
| 160 psm_precursor_id id of the ms1 spectrum that was the source of the psm | |
| 161 psm_precursor_mz mass over charge of the precursor of the psm | |
| 162 pattern_charge charge of the pattern | |
| 163 pattern_precursor_id id of the ms1 spectrum that was the source of the pattern | |
| 164 pattern_total_intensity total intensity of the pattern | |
| 165 pattern_peak_count # of peaks in the pattern | |
| 166 pattern_median_peak_spacing median mass difference between a pattern's peaks | |
| 167 spectrum_mass_irregularity a measure for the standard deviation in the mass difference between a pattern's peaks | |
| 168 ratio_na the estimated isotope ratio inferred from neutron abundance (sip experiments) | |
| 169 ratio_fft the estimated isotope ratio inferred by the fft method (natural isotope abundances) | |
| 170 error_fft the remaining error after fitting the pattern with fft | |
| 171 error_clumpy the remaining error after fitting the pattern with the clumpy carbon method | |
| 172 flag_peptide_contains_sulfur true if peptide contains sulfur | |
| 173 flag_peptide_has_modifications true if peptide has modifications | |
| 174 flag_peptide_assigned_to_multiple_bins true if peptide is associated with multiple proteins from different bins/mags | |
| 175 flag_peptide_assigned_to_multiple_proteins true if peptide is associated with multiple proteins | |
| 176 flag_peptide_mass_and_elements_undefined true if peptide has unknown mass and elemental composition | |
| 177 flag_psm_has_low_confidence true if psm was flagged as having low confidence (peptide identity uncertain) | |
| 178 flag_psm_is_ambiguous true if psm could not be assigned with certainty | |
| 179 flag_pattern_is_contaminated true if multiple patterns have one or more shared peaks | |
| 180 flag_pattern_is_wobbly true if pattern_median_peak_spacing exceeds a threshold | |
| 181 flag_peak_at_minus_one_pos true if a peak was detected immediately before the monoisotopic peak, could indicate overlap with another pattern | |
| 182 i0 - i19 the intensities of the first 20 peaks of the pattern | |
| 183 m0 - m19 the masses of the first 20 peaks of the pattern | |
| 184 c1 - c6 contributions of clumps of 1-6 carbon to ratio_na. These are the outcomes of the clumpy carbon model. These results are only meaningful if the biomass was labeled to saturation. | |
| 185 ========================================== =================== | |
| 186 ]]></help> | |
| 187 <citations> <!-- references for this tool, each given as a DOI --> | |
| 188 <citation type="doi">10.1186/s40168-022-01454-1</citation> | |
| 189 <citation type="doi">10.1073/pnas.1722325115</citation> | |
| 190 <citation type="doi">10.1101/2021.03.29.437612</citation> | |
| 191 <citation type="doi">10.1093/bioinformatics/bty046</citation> | |
| 192 </citations> | |
| 193 </tool> |
