changeset 1:071a185c2ced
new tools
--- a/README.rst	Thu Jan 16 13:22:38 2014 +0100
+++ b/README.rst	Fri Oct 24 12:52:56 2014 +0200
@@ -19,6 +19,22 @@
 ============== ======================================================================
 Date           Changes
 -------------- ----------------------------------------------------------------------
+September 2014 * Added new membership cutoff option for final clusters in MsClust
+               * Improved MsClust memory usage for large datasets
+               * Simplified MsClust HTML report
+               * Added option for micro-minutes based clustering in MsClust, instead
+                 of scan-nr based
+April 2014     * Added interface to ExactMassDB, Pep1000, KEGG, KNApSAcK, Flavonoid
+                 Viewer, LipidMAPS, HMDB, PubChem, by using the service MFSearcher.
+                 This enables users to query multiple public repositories for
+                 elemental compositions from accurate mass values detected by
+                 high-resolution mass spectrometers. NB: see also the added
+                 licensing info below.
+March 2014     * Added interface to the METEXP data store, including a tool to fire
+                 queries in batch mode
+               * Improved quantification output files of MsClust, among others by
+                 sorting the mass list based on intensity (last two columns of the
+                 quantification files)
 January 2014   * first release via Tool Shed, combining the RIQC and MsClust in a
                  single package (this package)
                * integration with METEXP software (data store for metabolomics
@@ -68,4 +84,17 @@
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-
\ No newline at end of file
+
+
+License for third party services
+================================
+MFSearcher service: http://webs2.kazusa.or.jp/mfsearcher/#090
+In the MFSearcher system, the compound data provided by KEGG, Flavonoid Viewer, LIPID MAPS, HMDB and PubChem
+were downloaded for academic purposes. The compound data of KNApSAcK is provided by Prof. Kanaya at the
+Nara Institute of Science and Technology (NAIST). Part of these data is utilized to construct the
+specified databases for rapid mass searching in the MFSearcher system, after re-calculating the molecular weights.
+Please observe the license terms of each original database when utilizing search results obtained against these
+databases via MFSearcher.
+
+The searching system of MFSearcher, the ExactMassDB database, and the Pep1000 database by Kazusa DNA
+Research Institute are licensed under a Creative Commons Attribution-NonCommercial 3.0 Unported License.
\ No newline at end of file
--- a/__init__.py	Thu Jan 16 13:22:38 2014 +0100
+++ b/__init__.py	Fri Oct 24 12:52:56 2014 +0200
@@ -1,6 +1,6 @@
-'''
-Module containing Galaxy tools for the GC/MS pipeline
-Created on Mar 6, 2012
-
-@author: marcelk
-'''
+'''
+Module containing Galaxy tools for the LC/MS or GC/MS pipeline
+Created on Mar, 2014
+
+@author: pieter lukasse
+'''
\ No newline at end of file
--- a/combine_output.py	Thu Jan 16 13:22:38 2014 +0100
+++ b/combine_output.py	Fri Oct 24 12:52:56 2014 +0200
@@ -155,12 +155,16 @@
     @param data: dictionary containing merged dataset
     @param out_csv: output csv file
     '''
-    header = ['Centrotype',
+    # Columns we don't repeat:
+    header_part1 = ['Centrotype',
               'cent.Factor',
               'scan nr.',
               'R.T. (umin)',
              'nr. Peaks',
-              'R.T.',
+              'R.T.']
+    # These are the headers/columns we repeat in case of
+    # combining hits in one line (see alternative_headers method below):
+    header_part2 = [
               'Name',
               'FORMULA',
               'Library',
@@ -190,13 +194,21 @@
     output_multi_handle = csv.writer(outfile_multi_handle, delimiter="\t")
 
     # Write headers
-    output_single_handle.writerow(header)
-    output_multi_handle.writerow(header * nhits)
+    output_single_handle.writerow(header_part1 + header_part2)
+    output_multi_handle.writerow(header_part1 + header_part2 + alternative_headers(header_part2, nhits-1))
 
     # Combine all hits for each centrotype into one line
     line = []
     for centrotype_idx in xrange(len(data)):
+        i = 0
         for hit in data[centrotype_idx]:
-            line.extend(hit)
+            if i == 0:
+                line.extend(hit)
+            else:
+                # for the alternative hits, skip the header_part1 columns already written:
+                line.extend(hit[6:])
+            i += 1
+        # small validation (if this fails, it is a programming error):
+        if i > nhits:
+            raise Exception('Error: more hits than expected for centrotype_idx ' + str(centrotype_idx))
         output_multi_handle.writerow(line)
         line = []
 
@@ -205,6 +217,17 @@
         for hit in data[centrotype_idx]:
             output_single_handle.writerow(hit)
 
+def alternative_headers(header_part2, nr_alternative_hits):
+    '''
+    This method will iterate over the header names and add the string 'ALT#_' before each,
+    where # is the number of the alternative, according to the number of alternative hits
+    we want to add to the final csv/tsv
+    '''
+    result = []
+    for i in xrange(nr_alternative_hits):
+        for header_name in header_part2:
+            result.append("ALT" + str(i+1) + "_" + header_name)
+    return result
 
 def main():
     '''
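For illustration, a minimal standalone sketch of what the repeated-header expansion produces (Python 2, matching the xrange usage above; the two-column header list is a made-up example):

    def alternative_headers(header_part2, nr_alternative_hits):
        # prefix each repeated column with ALT<n>_ for every extra hit
        return ["ALT" + str(i + 1) + "_" + header_name
                for i in xrange(nr_alternative_hits)
                for header_name in header_part2]

    print(alternative_headers(['Name', 'FORMULA'], 2))
    # ['ALT1_Name', 'ALT1_FORMULA', 'ALT2_Name', 'ALT2_FORMULA']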
--- a/combine_output.xml	Thu Jan 16 13:22:38 2014 +0100
+++ b/combine_output.xml	Fri Oct 24 12:52:56 2014 +0200
@@ -4,10 +4,11 @@
     combine_output.py $rankfilter_in $caslookup_in $out_single $out_multi
   </command>
   <inputs>
-    <param format="tabular" name="caslookup_in" type="data" label="RIQC-Lookup RI for CAS output"
+    <param format="tabular" name="rankfilter_in" type="data" label="RIQC-RankFilter output (Estimated RI)"
+           help="Select the output file from the RankFilter tool"/>
+    <param format="tabular" name="caslookup_in" type="data" label="RIQC-Lookup RI for CAS output ('Known' RI)"
            help="Select the output file from the CasLookup tool"/>
-    <param format="tabular" name="rankfilter_in" type="data" label="RIQC-RankFilter output"
-           help="Select the output file from the RankFilter tool"/>
+    <!-- TODO: could add a param "tolerance for ERI-KRI" (Estimated RI - Known RI) -->
   </inputs>
   <outputs>
     <data format="tabular" label="${tool.name} (Single) on ${on_string}" name="out_single" />
--- a/datatypes_conf.xml	Thu Jan 16 13:22:38 2014 +0100
+++ b/datatypes_conf.xml	Fri Oct 24 12:52:56 2014 +0200
@@ -3,9 +3,6 @@
   <datatype_files>
   </datatype_files>
   <registration display_path="display_applications">
-    <!-- type for the pdf -->
-    <datatype extension="pdf" type="galaxy.datatypes.data:Data" mimetype="application/octet-stream"
-              display_in_upload="true" subclass="true"/>
     <datatype extension="msclust.csv" type="galaxy.datatypes.tabular:Tabular" mimetype="text/csv"
               display_in_upload="true" subclass="true">
     </datatype>
   </registration>
--- a/export_to_metexp_tabular.py	Thu Jan 16 13:22:38 2014 +0100
+++ b/export_to_metexp_tabular.py	Fri Oct 24 12:52:56 2014 +0200
@@ -5,17 +5,18 @@
 into a tabular file that can be uploaded to the MetExp database.
 
 RankFilter, CasLookup are already combined by combine_output.py so here we will use
-this result. Furthermore here the MsClust spectra file (.MSP) and one of the MsClust
-quantification files are to be combined with combine_output.py result as well.
+this result. Furthermore, here one of the MsClust quantification files, containing
+the respective spectra details, is to be combined as well.
 
 Extra calculations performed:
 - The column MW is also added here and is derived from the column FORMULA found
-  in combine_output.py result.
+  in the RankFilter, CasLookup combined result.
 
-So in total here we merge 3 files and calculate one new column.
+So in total here we merge 2 files and calculate one new column.
 '''
-
+from pkg_resources import resource_filename  # @UnresolvedImport # pylint: disable=E0611
 import csv
+import re
 import sys
 from collections import OrderedDict
@@ -40,14 +41,15 @@
 ONE_TO_ONE = 'one_to_one'
 N_TO_ONE = 'n_to_one'
 
-def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, relation_type=ONE_TO_ONE):
+def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, metadata, relation_type=ONE_TO_ONE):
     '''
     Merges data from both input dictionaries based on the link fields. This method will
     build up a new list containing the merged hits as the items.
     @param set1: dictionary holding set1 in the form of N lists (one list per attribute name)
     @param set2: dictionary holding set2 in the form of N lists (one list per attribute name)
     '''
-    # TODO test for correct input files -> same link_field values should be there (test at least number of unique link_field values):
+    # TODO test for correct input files -> same link_field values should be there
+    # (test at least the number of unique link_field values):
     #
     # if (len(set1[link_field_set1]) != len(set2[link_field_set2])):
     #     raise Exception('input files should have the same nr of key values ')
@@ -64,17 +66,23 @@
         # Get the indices for current link_field_set1_value in both data-structures for proper matching
         set1index = [index for index, value in enumerate(set1[link_field_set1]) if value == link_field_set1_value]
         set2index = [index for index, value in enumerate(set2[link_field_set2]) if compare_function(value, link_field_set1_value)==True ]
-
-
+        # Validation:
+        if len(set2index) == 0:
+            # corresponding data could not be found in set2, so throw an error:
+            raise Exception("Datasets not compatible, merge not possible. " + link_field_set1 + "=" +
+                            link_field_set1_value + " only found in first dataset. ")
 
         merged_hits = []
         # Combine hits
         for hit in xrange(len(set1index)):
             # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do
             # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its
-            # corresponding value in the rankfilter or caslookup tables; i.e.
-            # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute
-            # represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index)
+            # corresponding value in the sets; i.e.
+            # set1[key] => returns the list/array with size = nrrows, with the values for the attribute
+            # represented by "key".
+            # set1index[hit] => points to the row nr=hit (hit is a rownr/index)
+            # So set1[x][set1index[n]] = set1.attributeX.instanceN
+            #
             # It just ensures the entry is made available as a plain named array for easy access.
             rf_record = OrderedDict(zip(set1.keys(), [set1[key][set1index[hit]] for key in set1.keys()]))
             if relation_type == ONE_TO_ONE :
@@ -83,7 +91,7 @@
             # is N to 1:
                 cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[0]] for key in set2.keys()]))
 
-            merged_hit = merge_function(rf_record, cl_record)
+            merged_hit = merge_function(rf_record, cl_record, metadata)
             merged_hits.append(merged_hit)
 
         merged.append(merged_hits)
@@ -103,29 +111,62 @@
 
 
-def _merge_records(rank_caslookup_combi, msclust_quant_record):
+def _merge_records(rank_caslookup_combi, msclust_quant_record, metadata):
     '''
     Combines single records from both the RankFilter+CasLookup combi file and from MsClust file
     @param rank_caslookup_combi: rankfilter and caslookup combined record (see combine_output.py)
     @param msclust_quant_record: msclust quantification + spectrum record
     '''
-    i = 0
     record = []
     for column in rank_caslookup_combi:
         record.append(rank_caslookup_combi[column])
-        i += 1
 
     for column in msclust_quant_record:
         record.append(msclust_quant_record[column])
-        i += 1
+
+    for column in metadata:
+        record.append(metadata[column])
+
+    # add MOLECULAR MASS (MM)
+    molecular_mass = get_molecular_mass(rank_caslookup_combi['FORMULA'])
+    # limit to two decimals:
+    record.append("{0:.2f}".format(molecular_mass))
+
+    # add MOLECULAR WEIGHT (MW) - TODO - calculate this
+    record.append('0.0')
+
+    # level of identification and Location of reference standard
+    record.append('0')
+    record.append('')
 
     return record
 
-
+def get_molecular_mass(formula):
+    '''
+    Calculates the molecular mass (MM).
+    E.g. MM of H2O = (relative) atomic mass of H x2 + (relative) atomic mass of O
+    '''
+
+    # Each element is represented by a capital letter, optionally followed by
+    # a lower case letter, with optional digits giving the element count:
+    element_pattern = re.compile(r"([A-Z][a-z]?)(\d*)")
 
-def _save_data(data, headers, nhits, out_csv):
+    total_mass = 0
+    for (element_name, count) in element_pattern.findall(formula):
+        if count == "":
+            count = 1
+        else:
+            count = int(count)
+        # note: Python's built-in float type has double precision
+        element_mass = float(elements_and_masses_map[element_name])
+        total_mass += element_mass * count
+
+    return total_mass
+
+
+
+def _save_data(data, headers, out_csv):
     '''
     Writes tab-separated data to file
     @param data: dictionary containing merged dataset
@@ -139,12 +180,35 @@
     # Write headers
     output_single_handle.writerow(headers)
 
-    # Write one line for each centrotype
-    for centrotype_idx in xrange(len(data)):
-        for hit in data[centrotype_idx]:
+    # Write
+    for item_idx in xrange(len(data)):
+        for hit in data[item_idx]:
             output_single_handle.writerow(hit)
 
+def _get_map_for_elements_and_masses(elements_and_masses):
+    '''
+    This method will read out the column 'Chemical symbol' and make a map
+    of this, storing the column 'Relative atomic mass' as its value
+    '''
+    resultMap = {}
+    index = 0
+    for entry in elements_and_masses['Chemical symbol']:
+        resultMap[entry] = elements_and_masses['Relative atomic mass'][index]
+        index += 1
+
+    return resultMap
+
+
+def init_elements_and_masses_map():
+    '''
+    Initializes the lookup map containing the elements and their respective masses
+    '''
+    elements_and_masses = _process_data(resource_filename(__name__, "static_resources/elements_and_masses.tab"))
+    global elements_and_masses_map
+    elements_and_masses_map = _get_map_for_elements_and_masses(elements_and_masses)
+
+
 def main():
     '''
     Combine Output main function
@@ -156,15 +220,27 @@
     rankfilter_and_caslookup_combined_file = sys.argv[1]
     msclust_quantification_and_spectra_file = sys.argv[2]
     output_csv = sys.argv[3]
+    # metadata
+    metadata = OrderedDict()
+    metadata['organism'] = sys.argv[4]
+    metadata['tissue'] = sys.argv[5]
+    metadata['experiment_name'] = sys.argv[6]
+    metadata['user_name'] = sys.argv[7]
+    metadata['column_type'] = sys.argv[8]
 
     # Read RankFilter and CasLookup output files
     rankfilter_and_caslookup_combined = _process_data(rankfilter_and_caslookup_combined_file)
     msclust_quantification_and_spectra = _process_data(msclust_quantification_and_spectra_file, ',')
+    # Read elements and masses to use for the MW/MM calculation:
+    init_elements_and_masses_map()
 
     merged, nhits = _merge_data(rankfilter_and_caslookup_combined, 'Centrotype',
-                                msclust_quantification_and_spectra, 'centrotype', _compare_records, _merge_records, N_TO_ONE)
-    headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys()
-    _save_data(merged, headers, nhits, output_csv)
+                                msclust_quantification_and_spectra, 'centrotype',
+                                _compare_records, _merge_records, metadata,
+                                N_TO_ONE)
+    headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys() + metadata.keys() + ['MM', 'MW', 'Level of identification', 'Location of reference standard']
+    _save_data(merged, headers, output_csv)
 
 
 if __name__ == '__main__':
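As a worked example of the formula parsing introduced above (a standalone sketch; the three-entry mass map is a made-up stand-in for the static_resources/elements_and_masses.tab lookup):

    import re

    elements_and_masses_map = {'C': 12.011, 'H': 1.008, 'O': 15.999}  # truncated stand-in

    def get_molecular_mass(formula):
        # capital letter, optional lower-case letter, optional digit count
        element_pattern = re.compile(r"([A-Z][a-z]?)(\d*)")
        total_mass = 0
        for element_name, count in element_pattern.findall(formula):
            total_mass += float(elements_and_masses_map[element_name]) * (int(count) if count else 1)
        return total_mass

    print("{0:.2f}".format(get_molecular_mass("C6H12O6")))  # glucose: 180.16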
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/export_to_metexp_tabular.xml	Fri Oct 24 12:52:56 2014 +0200
@@ -0,0 +1,75 @@
+<tool id="export_to_metexp_tabular"
+      name="METEXP - Tabular file"
+      version="0.2.0">
+  <description>Create tabular file for loading into METabolomics EXPlorer database</description>
+  <command interpreter="python">
+    export_to_metexp_tabular.py
+    $rankfilter_and_caslookup_combi
+    $msclust_quant_file
+    $output_result
+    "$organism"
+    "$tissue"
+    "$experiment_name"
+    "$user_name"
+    "$column_type"
+  </command>
+  <inputs>
+    <param format="tabular" name="rankfilter_and_caslookup_combi" type="data" label="RIQC-Combine RankFilter and CasLookup output"
+           help="Select the (multi) output file from the 'Combine RankFilter and CasLookup' tool"/>
+    <param format="tabular" name="msclust_quant_file" type="data" label="MsClust quantification file output"
+           help="Select the output file from MsClust (centrotype, mic or sim) which also contains the respective spectrum details"/>
+
+    <param name="organism" type="text" size="80"
+           label="Organism(s) info"
+           help="Metadata information to accompany the results when stored in MetExp DB.">
+      <validator type="empty_field" message="A value is required."></validator><!-- attribute optional="False" does not seem to work for params, so a validator is added -->
+    </param>
+
+    <param name="tissue" type="text" size="80"
+           label="Tissue(s) info"
+           help="Metadata information to accompany the results when stored in MetExp DB.">
+      <validator type="empty_field" message="A value is required."></validator>
+    </param>
+
+    <param name="experiment_name" type="text" size="80"
+           label="Experiment name/code"
+           help="Name or code to store the results under. This helps you to retrieve the results later in MetExp DB.">
+      <validator type="empty_field" message="A value is required."></validator>
+    </param>
+
+    <param name="user_name" type="text" size="80"
+           label="User name"
+           help="User name or code to store the results under. This helps you to retrieve the results later in MetExp DB.">
+      <validator type="empty_field" message="A value is required."></validator>
+    </param>
+
+    <param name="column_type" type="text" size="80"
+           label="Column type"
+           help="Column type to report with the results. This helps you to retrieve the results later in MetExp DB.">
+      <validator type="empty_field" message="A value is required."></validator>
+    </param>
+
+  </inputs>
+  <outputs>
+    <data format="tabular" label="${tool.name} on ${on_string}" name="output_result" />
+  </outputs>
+  <help>
+.. class:: infomark
+
+Tool to combine output from the tools RankFilter, CasLookup and MsClust
+into a tabular file that can be uploaded to the METabolomics EXPlorer (MetExp) database.
+
+RankFilter and CasLookup output are already combined by the 'RIQC-Combine RankFilter and CasLookup' tool, so here
+we use this result.
+
+**Notes**
+
+Extra calculations performed:
+- The columns MM and MW are also added here; they are derived from the column FORMULA found in the RankFilter/CasLookup combined result.
+
+So in total here we merge 2 files and calculate one new column.
+
+  </help>
+</tool>
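Outside Galaxy, the wrapper reduces to a plain positional call; a hypothetical invocation (all file names and metadata values are made up) matching the sys.argv order parsed in export_to_metexp_tabular.py:

    python export_to_metexp_tabular.py combine_output_multi.tsv msclust_quant.csv \
        metexp_upload.tab "Solanum lycopersicum" "fruit" "exp001" "jdoe" "GC"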
--- a/library_lookup.xml	Thu Jan 16 13:22:38 2014 +0100
+++ b/library_lookup.xml	Fri Oct 24 12:52:56 2014 +0200
@@ -13,16 +13,22 @@
     $regression.model
   </command>
   <inputs>
+    <!-- Regarding the <page> items: this blocks the use of this tool in Galaxy workflows. However,
+         alternatives like wrapping this in conditionals or repeats (to force a refresh_on_change, as this option
+         is not working on its own) failed, since the workflow editor does not support refreshes... nor does the
+         workflow runtime support conditionals or repeats to be set at runtime. See also the galaxy-dev mail
+         thread '"when else" in <conditional>? RE: refresh_on_change: is this a valid attribute? Any other ideas/options??' -->
     <page>
       <param format="tabular" name="input" type="data" label="NIST identifications as tabular file"
             help="Select a tab delimited NIST metabolite identifications file (converted from PDF)" />
       <param name="library_file" type="select" label="CAS x RI Library file"
             help="Select a library/lookup file containing RI values for CAS numbers on various chromatography columns "
             dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/RI_DB_libraries")'/>
+    </page>
+    <page>
       <param name="col_type" type="select" label="Select column type" refresh_on_change="true"
             display="radio" dynamic_options='get_column_type(library_file)'
             help="" />
-    </page>
     <page>
       <param name="polarity" type="select" label="Select polarity" refresh_on_change="true"
             display="radio" dynamic_options='filter_column(library_file,col_type)'
--- a/match_library.py	Thu Jan 16 13:22:38 2014 +0100
+++ b/match_library.py	Fri Oct 24 12:52:56 2014 +0200
@@ -17,15 +17,19 @@
     GC-column types. Used by the library_lookup.xml tool
     @param library_file: given library file from which the list of GC-column types is extracted
     '''
-    (data, header) = read_library(library_file)
-
-    if 'columntype' not in header:
-        raise IOError('Missing columns in ', library_file)
-
-    # Filter data on column type
-    column_type = header.index("columntype")
-    amounts_in_list_dict = count_occurrence([row[column_type] for row in data])
-    galaxy_output = [(str(a) + "(" + str(b) + ")", a, False) for a, b in amounts_in_list_dict.items()]
+    if library_file == "":
+        galaxy_output = [("", "", False)]
+    else:
+        (data, header) = read_library(library_file)
+
+        if 'columntype' not in header:
+            raise IOError('Missing columns in ', library_file)
+
+        # Filter data on column type
+        column_type = header.index("columntype")
+        amounts_in_list_dict = count_occurrence([row[column_type] for row in data])
+        galaxy_output = [(str(a) + "(" + str(b) + ")", a, False) for a, b in amounts_in_list_dict.items()]
+
     return(galaxy_output)
@@ -35,19 +39,23 @@
     @param library_file: file containing the database
     @param column_type_name: column type to filter on
     '''
-    (data, header) = read_library(library_file)
-
-    if ('columntype' not in header or
-        'columnphasetype' not in header):
-        raise IOError('Missing columns in ', library_file)
-
-    column_type = header.index("columntype")
-    statphase = header.index("columnphasetype")
-
-    # Filter data on colunn type name
-    statphase_list = [line[statphase] for line in data if line[column_type] == column_type_name]
-    amounts_in_list_dict = count_occurrence(statphase_list)
-    galaxy_output = [(str(a) + "(" + str(b) + ")", a, False)for a, b in amounts_in_list_dict.items()]
+    if library_file == "":
+        galaxy_output = [("", "", False)]
+    else:
+        (data, header) = read_library(library_file)
+
+        if ('columntype' not in header or
+            'columnphasetype' not in header):
+            raise IOError('Missing columns in ', library_file)
+
+        column_type = header.index("columntype")
+        statphase = header.index("columnphasetype")
+
+        # Filter data on column type name
+        statphase_list = [line[statphase] for line in data if line[column_type] == column_type_name]
+        amounts_in_list_dict = count_occurrence(statphase_list)
+        galaxy_output = [(str(a) + "(" + str(b) + ")", a, False)for a, b in amounts_in_list_dict.items()]
+
     return(sorted(galaxy_output))
@@ -58,22 +66,26 @@
     @param column_type_name: column type to filter on
     @param statphase: stationary phase of the column to filter on
     '''
-    (data, header) = read_library(library_file)
-
-    if ('columntype' not in header or
-        'columnphasetype' not in header or
-        'columnname' not in header):
-        raise IOError('Missing columns in ', library_file)
-
-    column_type_column = header.index("columntype")
-    statphase_column = header.index("columnphasetype")
-    column_name_column = header.index("columnname")
-
-    # Filter data on given column type name and stationary phase
-    statphase_list = [line[column_name_column] for line in data if line[column_type_column] == column_type_name and
-                      line[statphase_column] == statphase]
-    amounts_in_list_dict = count_occurrence(statphase_list)
-    galaxy_output = [(str(a) + "(" + str(b) + ")", a, False)for a, b in amounts_in_list_dict.items()]
+    if library_file == "":
+        galaxy_output = [("", "", False)]
+    else:
+        (data, header) = read_library(library_file)
+
+        if ('columntype' not in header or
+            'columnphasetype' not in header or
+            'columnname' not in header):
+            raise IOError('Missing columns in ', library_file)
+
+        column_type_column = header.index("columntype")
+        statphase_column = header.index("columnphasetype")
+        column_name_column = header.index("columnname")
+
+        # Filter data on given column type name and stationary phase
+        statphase_list = [line[column_name_column] for line in data if line[column_type_column] == column_type_name and
+                          line[statphase_column] == statphase]
+        amounts_in_list_dict = count_occurrence(statphase_list)
+        galaxy_output = [(str(a) + "(" + str(b) + ")", a, False)for a, b in amounts_in_list_dict.items()]
+
     return(sorted(galaxy_output))
@@ -96,9 +108,10 @@
     fill a Galaxy drop-down combo box.
 
     '''
-    files = glob.glob(dir_name + "/*.txt")
+    files = glob.glob(dir_name + "/*.*")
     if len(files) == 0:
-        raise Exception("Configuration error: no library files found in <galaxy-home-dir>/" + dir_name)
+        # Configuration error: no library files found in <galaxy-home-dir>/<dir_name>:
+        galaxy_output = [("Configuration error: expected file not found in <galaxy-home-dir>/" + dir_name, "", False)]
     else:
         galaxy_output = [(str(get_file_name_no_ext(file_name)), str(os.path.abspath(file_name)), False) for file_name in files]
     return(galaxy_output)
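All three lookup functions feed Galaxy dynamic_options, which expects a list of (label, value, selected) tuples; a minimal sketch of the shape being returned (the column types and occurrence counts are invented):

    # each entry: (display label incl. occurrence count, option value, pre-selected flag)
    amounts_in_list_dict = {'capillary': 12, 'packed': 3}  # invented example counts
    galaxy_output = [(str(a) + "(" + str(b) + ")", a, False)
                     for a, b in amounts_in_list_dict.items()]
    # e.g. [('capillary(12)', 'capillary', False), ('packed(3)', 'packed', False)]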
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/metaMS_cmd_interface.r	Fri Oct 24 12:52:56 2014 +0200
@@ -0,0 +1,99 @@
+## read args:
+args <- commandArgs(TRUE)
+## the constructed DB, e.g. "E:/Rworkspace/metaMS/data/LCDBtest.RData"
+args.constructedDB <- args[1]
+## data files, e.g. "E:/Rworkspace/metaMS/data/data.zip" (with e.g. .CDF files) and unzip output dir, e.g. "E:/"
+args.dataZip <- args[2]
+args.zipExtrDir <- args[3]
+## settings file, e.g. "E:/Rworkspace/metaMS/data/settings.r"; should contain an assignment to an object named "customMetaMSsettings"
+args.settings <- args[4]
+
+## output file names, e.g. "E:/Rworkspace/metaMS/data/out.txt"
+args.outAnnotationTable <- args[5]
+args.outLogFile <- args[6]
+args.xsetOut <- args[7]
+
+## report files
+args.htmlReportFile <- args[8]
+args.htmlReportFile.files_path <- args[9]
+
+# Redirect all R messages and output to the log file using sink(), see http://mazamascience.com/WorkingWithData/?p=888
+msg <- file(args.outLogFile, open="wt")
+sink(msg, type="message")
+sink(msg, type="output")
+
+cat("\nSettings used===============:\n")
+cat(readChar(args.settings, 1e5))
+
+
+tryCatch(
+        {
+            library(metaMS)
+
+            ## load the constructed DB:
+            tempEnv <- new.env()
+            testDB <- load(args.constructedDB, envir=tempEnv)
+
+            ## load the data files from a zip file
+            files <- unzip(args.dataZip, exdir=args.zipExtrDir)
+
+            ## load settings "script" into "customMetaMSsettings"
+            source(args.settings, local=tempEnv)
+            message(paste(" loaded : ", args.settings))
+
+            # trigger runLC (run with 20 slave processes and return the xcmsSet as well):
+            LC <- runLC(files, settings = tempEnv[["customMetaMSsettings"]], DB = tempEnv[[testDB[1]]]$DB, nSlaves=20, returnXset = TRUE)
+
+            # write out runLC annotation results:
+            write.table(LC$Annotation$annotation.table, args.outAnnotationTable, sep="\t", row.names=FALSE)
+
+            # the used constructed DB (write to log):
+            cat("\nConstructed DB info===============:\n")
+            str(tempEnv[[testDB[1]]]$Info)
+            cat("\nConstructed DB table===============:\n")
+            write.table(tempEnv[[testDB[1]]]$DB, args.outLogFile, append=TRUE, row.names=FALSE)
+            write.table(tempEnv[[testDB[1]]]$Reftable, args.outLogFile, sep="\t", append=TRUE, row.names=FALSE)
+            # save xset as rdata:
+            xsetData <- LC$xset@xcmsSet
+            saveRDS(xsetData, file=args.xsetOut)
+
+            message("\nGenerating report.........")
+            # report
+            dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE)
+            setwd(file.path(args.htmlReportFile.files_path))
+            html <- "<html><body><h1>Extracted Ion Chromatograms of groups with more than 3 peaks</h1>"
+
+            gt <- groups(LC$xset@xcmsSet)
+            groupidx1 <- which(gt[,"rtmed"] > 0 & gt[,"rtmed"] < 3000 & gt[,"npeaks"] > 3)
+            if (length(groupidx1) > 0)
+            {
+                eiccor <- getEIC(LC$xset@xcmsSet, groupidx = c(groupidx1))
+                eicraw <- getEIC(LC$xset@xcmsSet, groupidx = c(groupidx1), rt = "raw")
+                for (i in 1:length(groupidx1))
+                {
+                    figureName <- paste(args.htmlReportFile.files_path, "/figure", i,".png", sep="")
+                    html <- paste(html,"<img src='", "figure", i,".png' />", sep="")
+                    png( figureName )
+                    plot(eiccor, LC$xset@xcmsSet, groupidx = i)
+                    devname = dev.off()
+                }
+            }
+
+
+            html <- paste(html,"</body></html>")
+            message("finished generating report")
+            write(html,file=args.htmlReportFile)
+            # unlink(args.htmlReportFile)
+            cat("\nWarnings================:\n")
+            str( warnings() )
+        },
+        error=function(cond) {
+            sink(NULL, type="message") # default setting
+            sink(stderr(), type="output")
+            message("\nERROR: ===========\n")
+            print(cond)
+        }
    )
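The script expects nine positional arguments, in the order read at the top of the file; a hypothetical command line (all paths made up):

    Rscript metaMS_cmd_interface.r LCDBtest.RData data.zip /tmp/extracted settings.r \
        annotations.tsv run.log xset.rds report.html report_files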
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/metams_lcms_annotate.xml	Fri Oct 24 12:52:56 2014 +0200
@@ -0,0 +1,138 @@
+<tool id="metams_lcms_annotate" name="METAMS-LC/MS Annotate" version="0.0.1">
+  <description>Runs the metaMS process for LC/MS feature grouping and annotation</description>
+  <requirements>
+    <!-- <requirement type="set_environment">XTANDEM_12_10_01_PATH</requirement>-->
+    <requirement type="package" version="3.1.1">R_bioc_metams</requirement>
+  </requirements>
+  <command interpreter="/home/lukas007/R/R-3.1.1/bin/Rscript">
+    metaMS_cmd_interface.r
+    $constructed_db
+    $data_files
+    $data_files"dir/"
+    $customMetaMSsettings
+    $outputFile
+    $outputLog
+    $xsetOut
+    $htmlReportFile
+    $htmlReportFile.files_path
+  </command>
+  <inputs>
+    <param name="constructed_db" type="select" label="Constructed DB"
+           help="Reference annotation database generated from matching measurements of a mixture of chemical standards
                 against a manually validated reference table which contains the key analytical information for each standard."
+           dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/metaMS")'/>
+
+    <param name="data_files" type="data" format="prims.fileset.zip" label="Data files (.zip file with CDFs)" help=".zip file containing the CDF files of the new measurements"/>
+
+    <param name="protocolName" type="text" size="30" label="protocolName" value="Synapt.QTOF.RP" help="protocolName"/>
+
+    <param name="method" type="select" size="30" label="PEAK PICKING method =====================================================">
+      <option value="matchedFilter" selected="true">matchedFilter</option>
+    </param>
+    <param name="step" type="float" size="10" value="0.05" label="step" help="step"/>
+    <param name="fwhm" type="integer" size="10" value="20" label="fwhm" help="fwhm" />
+    <param name="snthresh" type="integer" size="10" value="4" label="snthresh" help="snthresh" />
+    <param name="max" type="integer" size="10" value="50" label="max" help="max" />
+
+    <param name="min_class_fraction" type="float" size="10" value="0.3" label="ALIGNMENT min.class.fraction =====================================================" help="min.class.fraction"/>
+    <param name="min_class_size" type="integer" size="10" value="3" label="min.class.size" help="min.class.size" />
+    <param name="mzwid" type="float" size="10" value="0.1" label="mzwid" help="mzwid"/>
+    <param name="bws" type="text" size="10" value="30,10" label="bws" help="bws"/>
+    <param name="missingratio" type="float" size="10" value="0.2" label="missingratio" help="missingratio"/>
+    <param name="extraratio" type="float" size="10" value="0.1" label="extraratio" help="extraratio"/>
+    <param name="retcormethod" type="select" size="30" label="retcormethod" help="retcormethod">
+      <option value="linear" selected="true">linear</option>
+    </param>
+    <param name="retcorfamily" type="select" size="30" label="retcorfamily" help="retcorfamily">
+      <option value="symmetric" selected="true">symmetric</option>
+    </param>
+    <param name="fillPeaks" type="select" size="30" label="fillPeaks" help="fillPeaks">
+      <option value="TRUE" selected="true">Yes</option>
+      <option value="FALSE">No</option>
+    </param>
+    <param name="perfwhm" type="float" size="10" value="0.6" label="CAMERA perfwhm =====================================================" help="perfwhm"/>
+    <param name="cor_eic_th" type="float" size="10" value="0.7" label="cor_eic_th" help="cor_eic_th" />
+    <param name="ppm" type="float" size="10" value="5.0" label="ppm" help="ppm" />
+    <param name="rtdiff" type="float" size="10" value="1.5" label="MATCH2DB rtdiff =====================================================" help="rtdiff"/>
+    <param name="rtval" type="float" size="10" value="0.1" label="rtval" help="rtval" />
+    <param name="mzdiff" type="float" size="10" value="0.005" label="mzdiff" help="mzdiff" />
+    <param name="match2DB_ppm" type="float" size="10" value="5.0" label="ppm" help="ppm" />
+    <param name="minfeat" type="integer" size="10" value="2" label="minfeat" help="minfeat" />
+
+  </inputs>
+  <configfiles>
+
+    <configfile name="customMetaMSsettings">## start comment
+    ## metaMS process settings
+    customMetaMSsettings <- metaMSsettings(protocolName = "${protocolName}",
+                                           chrom = "LC",
+                                           PeakPicking = list(
+                                               method = "${method}",
+                                               step = ${step},
+                                               fwhm = ${fwhm},
+                                               snthresh = ${snthresh},
+                                               max = ${max}),
+                                           Alignment = list(
+                                               min.class.fraction = ${min_class_fraction},
+                                               min.class.size = ${min_class_size},
+                                               mzwid = ${mzwid},
+                                               bws = c(${bws}),
+                                               missingratio = ${missingratio},
+                                               extraratio = ${extraratio},
+                                               retcormethod = "${retcormethod}",
+                                               retcorfamily = "${retcorfamily}",
+                                               fillPeaks = ${fillPeaks}),
+                                           CAMERA = list(
+                                               perfwhm = ${perfwhm},
+                                               cor_eic_th = ${cor_eic_th},
+                                               ppm= ${ppm}))
metaSetting(customMetaMSsettings, "match2DB") <- list(
    rtdiff = ${rtdiff},
    rtval = ${rtval},
    mzdiff = ${mzdiff},
    ppm = ${match2DB_ppm},
    minfeat = ${minfeat})</configfile>

+  </configfiles>
+
+  <outputs>
+    <data name="outputFile" format="tabular" label="${tool.name} on ${on_string} - metaMS annotated file (TSV)"/>
+    <data name="outputLog" format="txt" label="${tool.name} on ${on_string} - metaMS LOG"/>
+    <data name="xsetOut" format="rdata" label="${tool.name} on ${on_string} - metaMS xcmsSet (RDATA)"/>
+    <data name="htmlReportFile" format="html" label="${tool.name} on ${on_string} - metaMS report (HTML)"/>
+  </outputs>
+  <tests>
+    <test>
+    </test>
+  </tests>
+  <code file="match_library.py" /> <!-- file containing the get_directory_files function used above -->
+  <help>
+
+.. class:: infomark
+
+Runs the metaMS process for LC/MS feature grouping and annotation. Parts of the metaMS process also make use of the XCMS and CAMERA tools and algorithms.
+The figure below shows the main parts of the metaMS process.
+
+.. image:: $PATH_TO_IMAGES/metaMS.png
+
+
+**References**
+
+If you use this Galaxy tool in work leading to a scientific publication please
+cite the following paper:
+
+Wehrens, R.; Weingart, G.; Mattivi, F. (2014).
+metaMS: an open-source pipeline for GC-MS-based untargeted metabolomics.
+Journal of Chromatography B, 966: 109-116.
+doi: 10.1016/j.jchromb.2014.02.051
+handle: http://hdl.handle.net/10449/24012
+
+  </help>
+  <citations>
+    <citation type="doi">10.1016/j.jchromb.2014.02.051</citation> <!-- example
+         see also https://wiki.galaxyproject.org/Admin/Tools/ToolConfigSyntax#A.3Ccitations.3E_tag_set
+         -->
+  </citations>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/msclust.xml	Fri Oct 24 12:52:56 2014 +0200
@@ -0,0 +1,341 @@
+<tool name="MsClust" id="msclust2" version="2.0.5">
+  <description>Extracts fragmentation spectra from aligned data</description>
+  <!--
+     For remote debugging start your listener on port 8000 and use the following as command interpreter:
+     java -jar -Xdebug -Xrunjdwp:transport=dt_socket,address=D0100564.wurnet.nl:8000
+     //////////////////////////
+
+     TODO in command below: add conditionals according to options of using or NOT the tolerances/thresholds from previous steps
+  -->
+  <command interpreter="java -jar ">
+    MsClust.jar
+    -peaksFileName $inputPeaks
+    -dataType $dataType
+    -imputationMethod $imputationMethod.type
+    #if $imputationMethod.type == "valueRange"
+        -rangeUpperLimit $imputationMethod.rangeUpperLimit
+    #end if
+    -plInputFormat "metalign"
+    -potDensFuncType $potDensFuncType.type
+    -centerSelectionType $centerSelectionType.type
+    -clusteringType $clusteringType.type
+    -neighborhoodWindowSize $potDensFuncType.pdf_neighborhoodWindowSize
+    -clusterSearchStopCriterium $centerSelectionType.cs_stop_criterion
+    -pearsonDistTreshold $potDensFuncType.pdf_pears_treshold
+    -pearsonTresholdConfidence $potDensFuncType.pdf_pears_conf
+    -pearsonPDReductionThreshold $centerSelectionType.cs_pears_pd_reductionTreshold
+    -pearsonPDReductionSlope $centerSelectionType.cs_pears_pd_reductionSlope
+    -rtDistTolUnit $potDensFuncType.rt_dist_tol_unit.type
+    -rtDistTol $potDensFuncType.rt_dist_tol_unit.pdf_rt_toler
+    -rtDistanceConfidence $potDensFuncType.pdf_scan_conf
+    #if $clusteringType.type == "original"
+        -clustMembershipCutoff $clusteringType.clust_membership_cutoff
+    #end if
+    -centrotypesOut $centrotypesOut
+    -simOut $simOut
+    -micOut $micOut
+    -mspOut $mspOut
+    -classOut $classOut
+    -outReport $htmlReportFile
+    -outReportPicturesPath $htmlReportFile.files_path
+    #if $clusteringType.type == "fuzzyCMeans"
+        -fcmMembershipWeightingExponent $clusteringType.fcmMembershipWeightingExponent
+        -fcmStopCriterion $clusteringType.fcmStopCriterion
+        -fcmCorrelationWeight $clusteringType.fcmCorrelationWeight
+        -fcmFinalAssemblyType $clusteringType.finalClusterAssembly.type
+        #if $clusteringType.finalClusterAssembly.type == "membershipBased"
+            -fcmMembershipCutoff $clusteringType.finalClusterAssembly.fcmMembershipCutoff
+        #end if
+    #end if
+    -verbose "false"
+    #if $advancedSettings.settings == True
+        -advancedSettings YES
+        -saturationLimit $advancedSettings.saturationLimit
+        -sampleSelectionSortType $advancedSettings.sampleSelectionSortType
+        -simSelectionAlgorithm $advancedSettings.simSelectionAlgorithm
+        -simMassFilter "$advancedSettings.simMassFilter"
+        -simMembershipThreshold $advancedSettings.simMembershipThreshold
+        -simSaturationThreshold $advancedSettings.simSaturationThreshold
+        -simAbsenseThreshold $advancedSettings.simAbsenseThreshold
+        -micMembershipThreshold $advancedSettings.micMembershipThreshold
+        -peakIntensityCorrectionAlgorithm $advancedSettings.peakIntensityCorrectionAlgorithm
+    #else
+        -advancedSettings YES
+        -sampleSelectionSortType SIM_INTENSITY
+        -peakIntensityCorrectionAlgorithm CORRELATION_BASED
+    #end if
+
+  </command>
+  <inputs>
+
+    <param name="inputPeaks" type="data" format="txt" label="Ion-wise aligned data (e.g. MetAlign output data)" />
+    <param name="dataType" type="select" size="30" label="Data type">
+      <option value="gcms" selected="true">GC-MS</option>
+      <option value="lcms">LC-MS</option>
+    </param>
+    <conditional name="imputationMethod">
+      <param name="type" type="select" size="30" label="Select the approach used for imputing missing values (optional)" help="select how you generated the values to fill in the data gaps">
+        <option value="none" >none</option>
+        <option value="metot" selected="true">MeTot</option>
+        <option value="valueRange">Values range</option>
+      </param>
+      <when value="valueRange">
+        <param name="rangeUpperLimit" type="integer" size="10" value="0" label="Range upper limit" help="values up to this limit will be considered 'generated' values" />
+      </when>
+      <when value="metot">
+      </when>
+      <when value="none">
+      </when>
+    </conditional>
+    <conditional name="potDensFuncType">
+      <param name="type" type="select" size="30" label="Select PD function type =====================================================">
+        <option value="original" selected="true">Original</option>
+      </param>
+      <when value="original">
+        <param name="pdf_neighborhoodWindowSize" type="integer" size="10" value="200" label="Effective Peaks" />
+        <conditional name="rt_dist_tol_unit">
+          <param name="type" type="select" size="30" label="Peak time unit">
+            <option value="1" selected="true">scan nr</option>
+            <option value="2" >(average) micro minutes</option>
+          </param>
+          <when value="1">
+            <param name="pdf_rt_toler" type="float" size="10" value="10" label="Peak Width, in scans" />
+          </when>
+          <when value="2">
+            <param name="pdf_rt_toler" type="float" size="10" value="100000" label="Peak Width, in micro minutes" help="e.g. 100,000=6 seconds" />
+          </when>
+        </conditional>
+        <param name="pdf_scan_conf" type="float" size="10" value="80" label="Peak Width confidence (0.0 to 99.99)" help="example: 0[no confidence]...50[good guess]...99.9[quite certain])" />
+        <param name="pdf_pears_treshold" type="float" size="10" value="0.8" label="Correlation threshold (0.0 - 1.0)" />
+        <param name="pdf_pears_conf" type="float" size="10" value="98.0" label="Correlation threshold confidence (0.0 to 99.99)" help="example: 0[no confidence]...50[good guess]...99.9[quite certain])" />
+      </when>
+    </conditional>
+    <conditional name="centerSelectionType">
+      <param name="type" type="select" label="Initial Centers selection type ==================================================" >
+        <option value="original" selected="true">Original - Subtractive potential reductions with stop criterion and REUSE tolerances (from PD function)</option>
+      </param>
+      <when value="original">
+        <param name="cs_pears_pd_reductionTreshold" type="float" size="10" value="0.8" label="Potential Density reduction (0.0 - 1.0)" />
+        <param name="cs_pears_pd_reductionSlope" type="float" size="10" value="0.01" label="Potential Density reduction softness " />
+        <param name="cs_stop_criterion" type="float" size="10" value="2" label="Stop Criterion " />
+      </when>
+    </conditional>
+    <conditional name="clusteringType">
+      <param name="type" type="select" label="Classify using ===========================================================">
+        <option value="original" selected="true">Original - Fuzzy clustering, keep original centers and REUSE (scan distance) tolerances</option>
+        <option value="fuzzyCMeans">(experimental) Fuzzy C-Means - Fuzzy clustering, optimize centers</option>
+      </param>
+      <when value="original">
+        <param name="clust_membership_cutoff" type="float" size="10" value=""
+               label="Membership cutoff (0.0 - 1.0)"
+               help="Items with membership below this value are NOT added to the cluster"/>
+      </when>
+      <!-- one idea would be to have clustering specific tolerance values, not reusing the centrotype selection ones
+      <when value="originalNewTol">
+        <param name="clust_scan_toler" type="float" size="10" value="10" label="Peak Width, in scans" />
+        <param name="clust_scan_slope" type="float" size="10" value="2" label="Peak Width margin softness" />
+      </when>
+      -->
+      <when value="fuzzyCMeans">
+        <param name="fcmMembershipWeightingExponent" type="float" size="10" value="2.0" label="Membership Weighting Exponent" help="Influences cluster center repositioning in the iterations 1.1 (exploratory) to around 3.0 (conservative)" />
+        <param name="fcmStopCriterion" type="float" size="10" value="0.05" label="Stop Criterion" help="When convergence is 'reached' (e.g. 0.05 means memberships only changed by 5% in the last iteration)" />
+        <param name="fcmCorrelationWeight" type="float" size="10" value="2" label="Correlation weight factor" help="Increase this if you think the correlation is reliable (e.g. you have a high number of samples)" />
+        <conditional name="finalClusterAssembly">
+          <param name="type" type="select" label="Final cluster assembly" >
+            <option value="original" selected="true">Original - distance based</option>
+            <option value="membershipBased">Membership based</option>
+          </param>
+          <when value="membershipBased">
+            <param name="fcmMembershipCutoff" type="select" label="Maximum allowed peak overlap" >
+              <option value="0.05" >~7 clusters</option>
+              <option value="0.10" >~5 clusters</option>
+              <option value="0.20" >~3 clusters</option>
+            </param>
+          </when>
+          <when value="original">
+            <!-- nothing -->
+          </when>
+        </conditional>
+      </when>
+    </conditional>
+
+    <param name="summaryReport" type="boolean" checked="true" label="Generate summary report" help="NB: this will increase the processing time (in some cases up to a few extra minutes)"/>
+
+    <conditional name="advancedSettings">
+      <param name="settings" type="boolean" truevalue="Yes" falsevalue="No" checked="false" label="Advanced settings ========================================================"/>
+      <when value="Yes">
+        <param name="saturationLimit" optional="true" type="integer" size="10" label="Saturation limit (optional)" help="fill in if you have saturation problems in your data" />
+        <param name="sampleSelectionSortType" type="select" label="Sample selection scheme for spectrum peak intensity correction algorithm (optional/experimental)" help="The intensity values to use to select the samples for each cluster/metabolite in which it is most intense/abundant. These samples are used in the peak intensity correction (see parameter below). Use this option to try to avoid samples that have insufficient signal or saturation." >
+          <option value="None">None</option>
+          <!-- in order of best FORWARD scoring when tested on /test/data/report_test_sets/(P2) Relative peak heights in spectra/Input (Test set 1) -->
+          <option value="SIM_INTENSITY" selected="true">SIM intensities</option>
+          <option value="MAX_INTENSITY">Maximum intensities</option>
+          <option value="CENTROTYPE_INTENSITY">Centrotype peak intensities</option>
+          <option value="MIC_INTENSITY">MIC intensities</option>
+        </param>
+        <param name="peakIntensityCorrectionAlgorithm" type="select" label="Spectrum peak intensity correction algorithm (optional/experimental)" help="Whether spectrum peak heights should be adjusted according to their membership to the cluster or to their correlation to the cluster's centrotype ion" >
+          <option value="MEMBERSHIP_BASED">Membership based (msclust 1.0 mode)</option>
+          <option value="CORRELATION_BASED" selected="true">Correlation based</option>
+        </param>
+        <param name="simSelectionAlgorithm" type="select" label="SIM selection algorithm (experimental)" help="Set this if you want to deviate from the standard, which is: allow shared SIM peaks for GC-MS data, and force unique SIM peaks for LC-MS data">
+          <option value="" selected="true"></option>
+          <option value="uniqueSIM">Unique SIM peak</option>
+          <option value="sharedSIM">Shared SIM peak</option>
+        </param>
+        <param name="simMassFilter" type="text" optional="true" size="30" label="SIM mass exclusion list" help="Comma-separated list of masses NOT to use as SIM peaks. E.g. '73,147,...' " />
+        <param name="simMembershipThreshold" optional="true" type="float" size="10" label="SIM membership threshold" help="Minimum membership a peak should have to qualify as a SIM candidate. E.g. 0.8 " />
+        <param name="simSaturationThreshold" optional="true" type="float" size="10" label="SIM saturation threshold (%)" help="Maximum % of samples in which a SIM candidate peak may be saturated. If the candidate peak exceeds this threshold, then another peak is chosen. If no peak can be found meeting this criterion, mass 0 is reported" />
+        <param name="simAbsenseThreshold" optional="true" type="float" size="10" label="SIM absence threshold (%)" help="Maximum % of samples in which a SIM candidate peak may be absent. If the candidate peak exceeds this threshold, then another peak is chosen. If no peak can be found meeting this criterion, mass 0 is reported" />
+
+        <param name="micMembershipThreshold" optional="true" type="float" size="10" label="MIC membership threshold" help="Minimum membership a peak should have to be counted in the MIC sum. E.g. 0.8 " />
+
+      </when>
+      <when value="No">
+      </when>
+    </conditional>
+
+
+  </inputs>
+  <outputs>
+    <data name="centrotypesOut" format="msclust.csv" label="${tool.name} on ${on_string} - centrotypes file"/>
+    <data name="simOut" format="msclust.csv" label="${tool.name} on ${on_string} - SIM file"/>
+    <data name="micOut" format="msclust.csv" label="${tool.name} on ${on_string} - MIC file"/>
+    <data name="mspOut" format="msp" label="${tool.name} on ${on_string} - SPECTRA file"/>
+    <data name="classOut" format="msclust.csv" label="${tool.name} on ${on_string} - Classification file"/>
+    <data name="htmlReportFile" format="html" label="${tool.name} on ${on_string} - HTML report">
+      <!-- If the expression is false, the file is not created -->
+      <filter>( summaryReport == True )</filter>
+    </data>
+  </outputs>
+  <tests>
+    <!-- find out how to use -->
+  </tests>
+  <help>
+
<!-- see also http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#hyperlink-targets -->

.. class:: infomark

This tool extracts spectra from ion-wise aligned MS(/MS) results. It uses expression profiles and
retention times of the putative ions to cluster them. Each cluster is then used to generate
one spectrum containing the clustered ions (peaks).

.. image:: msclust_summary.png


-----

**Input**

The input file should contain the following columns (in this order), followed by the sample intensity columns (one column with the
intensity value for each sample):

*ScanNR*

*Ret(umin)*

*Mass(uD)*

*(Optional) retentionMean*

*(only required if retentionMean is present) retentionSD*

*N sample intensity columns...*


-----

**Output**

This tool returns a number of output files and a small report.

**Parameters index**


*Select the approach used for imputing missing values:* only select this if you have used a specific method to
fill in the data gaps in the input file. One example is replacing zero values by some randomly generated low value.
If MeTot is chosen, then a value is considered generated if: the value contains a dot '.' and some number
other than 0 (zero) after the dot.

*Effective Peaks:* Neighborhood window size to consider when calculating density. Smaller values increase
performance but are less reliable.

*Peak Width, in scans:* Scan window width of scans to consider 'close'. One can see this as the
'tolerated variation in scans' for the apex positions of the fragment peaks composing a cluster.
Note: if MetAlign was used, this is the variation *after* pre-processing by MetAlign.

*Peak Width confidence:* The higher the confidence, the stricter the threshold.

*Correlation threshold (0.0 - 1.0):* Tolerance center for pearson distance calculation. The higher this value,
the higher the correlation between 2 items has to be for them to be considered 'close'.

*Correlation threshold confidence:* The higher the confidence, the stricter the threshold. `More...`__

*Potential Density reduction (0.0 - 1.0):* Reduction tolerance center for pearson distance calculation.
The higher this value, the less the low correlated items get reduced, getting a chance to form a cluster of their own.

*Potential Density reduction softness:* Reduction curve slope for pearson distance tolerance. Lower
values = stricter separation at the value determined in 'Potential Density reduction' above
(TODO review this comment).

*Stop Criterion:* When to stop reducing and looking for new clusters. Lower values = more iterations

.. __: javascript:window.open('.. image:: confidence_and_slope_params_explain.png'.replace('.. image:: ', ''),'popUpWindow','height=700,width=800,left=10,top=10,resizable=yes,scrollbars=yes,toolbar=yes,menubar=no,location=no,directories=no,status=yes')


-----

**Output files described below**

-----

*SPECTRA:* this file can be submitted to NIST for identification of the spectra.

`Click here for more details on the Sample selection and Spectrum peak intensity correction algorithm parameters related to SPECTRA generation`_

.. _Click here for more details on the Sample selection and Spectrum peak intensity correction algorithm parameters related to SPECTRA generation: javascript:window.open('.. image:: sample_sel_and_peak_height_correction.png'.replace('.. image:: ', ''),'popUpWindow','height=700,width=800,left=10,top=10,resizable=yes,scrollbars=yes,toolbar=yes,menubar=no,location=no,directories=no,status=yes')

-----

*MIC:* stands for Measured Ions Count -> it contains, for each cluster, the sum of the ion count
values (corrected by their membership) for all MEASURED cluster ions in the given sample.

The MIC for a **cluster i** in **sample s**, where **cluster i** has **n** members, is thus:

sum ( [intensity of member n in **sample s**] x [membership value of member n in **cluster i** ] )

-----

*SIM:* stands for Selective Ion Mode -> it contains, for each cluster, the intensity values of the
most representative member ion peak of this cluster. The most representative member peak is the one with the
highest membership*average_intensity. This definition leads to conflicts, as a peak can have a
membership in two or more clusters. The assignment of a SIM peak to a cluster depends on
the configured data type (LC or GC-MS). NB: this can be overruled in the "advanced settings":

(1) LC-MS SIM: select SIM peak only once and for the centrotype in which this specific mass has its
highest membership; for neighboring centrotypes use its "second best SIM", etcetera. In other words,
if the SIM peak has been identified as the SIM in more than 1 cluster, assign it as SIM to the cluster
with the highest membership. Continue searching for other SIM peaks to assign to the other clusters until
all ambiguities are solved.

(2) GC-MS SIM: the SIM peak can be "shared" by multiple clusters. However, the intensity values are corrected
by the membership value of the peak in the cluster in case the SIM peak is "shared". If the SIM peak is not
"shared" then the "raw" intensity values of the SIM peak are recorded in the SIM file.

`Click here for more details on the SIM output file`_

.. _Click here for more details on the SIM output file: javascript:window.open('.. image:: sample_SIM.png'.replace('.. image:: ', ''),'popUpWindow','height=700,width=800,left=10,top=10,resizable=yes,scrollbars=yes,toolbar=yes,menubar=no,location=no,directories=no,status=yes')


**References**

If you use this Galaxy tool in work leading to a scientific publication please
cite the following paper:

Y. M. Tikunov, S. Laptenok, R. D. Hall, A. Bovy, and R. C. H. de Vos (2012).
MSClust: a tool for unsupervised mass spectra extraction of
chromatography-mass spectrometry ion-wise aligned data
http://dx.doi.org/10.1007%2Fs11306-011-0368-2

  </help>
</tool>
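The MIC definition above is simply a membership-weighted sum; a small standalone sketch (intensities and memberships for a three-member cluster are invented):

    # membership-weighted sum of measured intensities for one cluster in one sample
    intensities = [1200.0, 450.0, 80.0]   # invented member-ion intensities in sample s
    memberships = [0.95, 0.80, 0.55]      # invented memberships in cluster i
    mic = sum(x * m for x, m in zip(intensities, memberships))
    print(mic)  # 1544.0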
--- a/msclust2.0.1.xml Thu Jan 16 13:22:38 2014 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,289 +0,0 @@ -<tool name="MsClust" id="msclust2" version="2.0.1"> - <description>Extracts fragmentation spectra from aligned data</description> - <!-- - For remote debugging start you listener on port 8000 and use the following as command interpreter: - java -jar -Xdebug -Xrunjdwp:transport=dt_socket,address=D0100564.wurnet.nl:8000 - ////////////////////////// - - TODO in command below: add conditionals according to options of using or NOT the tolerances/thresholds from previous steps - --> - <command interpreter="java -jar "> - MsClust.jar - -peaksFileName $inputPeaks - -dataType $dataType - -imputationMethod $imputationMethod.type - #if $imputationMethod.type == "valueRange" - -rangeUpperLimit $imputationMethod.rangeUpperLimit - #end if - -plInputFormat "metalign" - -potDensFuncType $potDensFuncType.type - -centerSelectionType $centerSelectionType.type - -clusteringType $clusteringType.type - -neighborhoodWindowSize $potDensFuncType.pdf_neighborhoodWindowSize - -clusterSearchStopCriterium $centerSelectionType.cs_stop_criterion - -pearsonDistTreshold $potDensFuncType.pdf_pears_treshold - -pearsonTresholdConfidence $potDensFuncType.pdf_pears_conf - -pearsonPDReductionThreshold $centerSelectionType.cs_pears_pd_reductionTreshold - -pearsonPDReductionSlope $centerSelectionType.cs_pears_pd_reductionSlope - -scanDistTol $potDensFuncType.pdf_scan_toler - -scanDistanceConfidence $potDensFuncType.pdf_scan_conf - -centrotypesOut $centrotypesOut - -simOut $simOut - -micOut $micOut - -mspOut $mspOut - -classOut $classOut - -outReport $htmlReportFile - -outReportPicturesPath $htmlReportFile.files_path - #if $clusteringType.type == "fuzzyCMeans" - -fcmMembershipWeightingExponent $clusteringType.fcmMembershipWeightingExponent - -fcmStopCriterion $clusteringType.fcmStopCriterion - -fcmCorrelationWeight $clusteringType.fcmCorrelationWeight - -fcmFinalAssemblyType $clusteringType.finalClusterAssembly.type - #if $clusteringType.finalClusterAssembly.type == "membershipBased" - -fcmMembershipCutoff $clusteringType.finalClusterAssembly.fcmMembershipCutoff - #end if - #end if - -verbose "false" - #if $advancedSettings.settings == True - -advancedSettings YES - -saturationLimit $advancedSettings.saturationLimit - -sampleSelectionSortType $advancedSettings.sampleSelectionSortType - -simSelectionAlgorithm $advancedSettings.simSelectionAlgorithm - -simMassFilter "$advancedSettings.simMassFilter" - -simMembershipThreshold $advancedSettings.simMembershipThreshold - -simSaturationThreshold $advancedSettings.simSaturationThreshold - -simAbsenseThreshold $advancedSettings.simAbsenseThreshold - -micMembershipThreshold $advancedSettings.micMembershipThreshold - -peakIntensityCorrectionAlgorithm $advancedSettings.peakIntensityCorrectionAlgorithm - #else - -advancedSettings YES - -sampleSelectionSortType SIM_INTENSITY - -peakIntensityCorrectionAlgorithm CORRELATION_BASED - #end if - - </command> - <inputs> - <!-- <param name="rankingWeightConfig" type="text" area="true" size="11x70" label="NB - TEST VERSION" -value="VERSION BEING TESTED AT THIS MOMENT...NOT READY FOR USE..."/> - --> - <param name="inputPeaks" type="data" format="txt" label="Ion-wise aligned data (e.g. 
MetAlign output data)" /> - <param name="dataType" type="select" size="30" label="Data type"> - <option value="gcms" selected="true">GC-MS</option> - <option value="lcms">LC-MS</option> - </param> - <conditional name="imputationMethod"> - <param name="type" type="select" size="30" label="Select the approach used for imputing missing values (optional)" help="select how you generated the values to fill in the data gaps"> - <option value="none" >none</option> - <option value="metot" selected="true">MeTot</option> - <option value="valueRange">Values range</option> - </param> - <when value="valueRange"> - <param name="rangeUpperLimit" type="integer" size="10" value="0" label="Range upper limit" help="values up to this limit will be considered 'generated' values" /> - </when> - </conditional> - <conditional name="potDensFuncType"> - <param name="type" type="select" size="30" label="Select PD function type ====================================================="> - <option value="original" selected="true">Original</option> - </param> - <when value="original"> - <param name="pdf_neighborhoodWindowSize" type="integer" size="10" value="200" label="Effective Peaks" /> - <param name="pdf_scan_toler" type="float" size="10" value="10" label="Peak Width, in scans" /> - <param name="pdf_scan_conf" type="float" size="10" value="80" label="Peak Width confidence (0.0 to 99.99)" help="example: 0[no confidence]...50[good guess]...99.9[quite certain])" /> - <param name="pdf_pears_treshold" type="float" size="10" value="0.8" label="Correlation threshold (0.0 - 1.0)" /> - <param name="pdf_pears_conf" type="float" size="10" value="98.0" label="Correlation threshold confidence (0.0 to 99.99)" help="example: 0[no confidence]...50[good guess]...99.9[quite certain])" /> - </when> - </conditional> - <conditional name="centerSelectionType"> - <param name="type" type="select" label="Initial Centers selection type ==================================================" > - <option value="original" selected="true">Original - Subtractive potential reductions with stop criterion and REUSE tolerances (from PD function)</option> - </param> - <when value="original"> - <param name="cs_pears_pd_reductionTreshold" type="float" size="10" value="0.8" label="Potential Density reduction (0.0 - 1.0)" /> - <param name="cs_pears_pd_reductionSlope" type="float" size="10" value="0.01" label="Potential Density reduction softness " /> - <param name="cs_stop_criterion" type="float" size="10" value="2" label="Stop Criterion " /> - </when> - </conditional> - <conditional name="clusteringType"> - <param name="type" type="select" label="Classify using ==========================================================="> - <option value="original" selected="true">Original - Fuzzy clustering, keep original centers and REUSE (scan distance) tolerances</option> - <option value="fuzzyCMeans">(experimental) Fuzzy C-Means - Fuzzy clustering, optimize centers</option> - </param> - <when value="original"> - <!-- nothing --> - </when> - <when value="originalNewTol"> - <param name="clust_scan_toler" type="float" size="10" value="10" label="Peak Width, in scans" /> - <param name="clust_scan_slope" type="float" size="10" value="2" label="Peak Width margin softness" /> - </when> - <when value="fuzzyCMeans"> - <param name="fcmMembershipWeightingExponent" type="float" size="10" value="2.0" label="Membership Weighting Exponent" help="Influences cluster center repositioning in the iterations 1.1 (exploratory) to around 3.0 (conservative)" /> - <param name="fcmStopCriterion" 
type="float" size="10" value="0.05" label="Stop Criterion" help="When convergence is 'reached' (e.g. 0.05 means memberships only changed by 5% in the last iteration)" /> - <param name="fcmCorrelationWeight" type="float" size="10" value="2" label="Correlation weight factor" help="Increase this if you think the correlation is reliable (e.g. you have a high number of samples)" /> - <conditional name="finalClusterAssembly"> - <param name="type" type="select" label="Final cluster assembly" > - <option value="original" selected="true">Original - distance based</option> - <option value="membershipBased">Membership based</option> - </param> - <when value="membershipBased"> - <param name="fcmMembershipCutoff" type="select" label="Maximum allowed peak overlap" > - <option value="0.05" >~7 clusters</option> - <option value="0.10" >~5 clusters</option> - <option value="0.20" >~3 clusters</option> - </param> - </when> - <when value="original"> - <!-- nothing --> - </when> - </conditional> - </when> - </conditional> - - <param name="summaryReport" type="boolean" checked="true" label="Generate summary report" help="NB: this will increase the processing time (in some cases up to a few extra minutes)"/> - - <conditional name="advancedSettings"> - <param name="settings" type="boolean" truevalue="Yes" falsevalue="No" checked="false" label="Advanced settings ========================================================"/> - <when value="Yes"> - <param name="saturationLimit" optional="true" type="integer" size="10" label="Saturation limit (optional)" help="fill in if you have saturation problems in your data" /> - <param name="sampleSelectionSortType" type="select" label="Sample selection scheme for spectrum peak intensity correction algorithm (optional/experimental)" help="The intensity values to use to select the samples for each cluster/metabolite in which it is most intense/abundant. These samples are used in the peak intensity correction (see parameter below). Use this option to try to avoid samples that have insufficient signal or saturation." 
> - <option value="None">None</option> - <!-- in order of best FORWARD scoring when tested on /test/data/report_test_sets/(P2) Relative peak heights in spectra/Input (Test set 1) --> - <option value="SIM_INTENSITY" selected="true">SIM intensities</option> - <option value="MAX_INTENSITY">Maximum intensities</option> - <option value="CENTROTYPE_INTENSITY">Centrotype peak intensities</option> - <option value="MIC_INTENSITY">MIC intensities</option> - </param> - <param name="peakIntensityCorrectionAlgorithm" type="select" label="Spectrum peak intensity correction algorithm (optional/experimental)" help="Whether spectrum peak heights should be adjusted according to their membership to the cluster or to their correlation to the cluster's centrotype ion" > - <option value="MEMBERSHIP_BASED">Membership based (msclust 1.0 mode)</option> - <option value="CORRELATION_BASED" selected="true">Correlation based</option> - </param> - <param name="simSelectionAlgorithm" type="select" label="SIM selection algorithm (experimental)" help="Set this if you want to deviate from the standard, which is: allow shared SIM peaks for GC-MS data, and force unique SIM peaks for LC-MS data"> - <option value="" selected="true"></option> - <option value="uniqueSIM">Unique SIM peak</option> - <option value="sharedSIM">Shared SIM peak</option> - </param> - <param name="simMassFilter" type="text" optional="true" size="30" label="SIM mass exclusion list" help="Comma-separated list of masses NOT to use as SIM peaks. E.g. '73,147,...' " /> - <param name="simMembershipThreshold" optional="true" type="float" size="10" label="SIM membership threshold" help="Minimum membership a peak should have to qualify as a SIM candidate. E.g. 0.8 " /> - <param name="simSaturationThreshold" optional="true" type="float" size="10" label="SIM saturation threshold (%)" help="Maximum % of samples in which a SIM candidate peak may be saturated. If the candidate peak exceeds this threshold, then another peak is chosen. If no peak can be found meeting this criterion, mass 0 is reported" /> - <param name="simAbsenseThreshold" optional="true" type="float" size="10" label="SIM absence threshold (%)" help="Maximum % of samples in which a SIM candidate peak may be absent. If the candidate peak exceeds this threshold, then another peak is chosen. If no peak can be found meeting this criterion, mass 0 is reported" /> - - <param name="micMembershipThreshold" optional="true" type="float" size="10" label="MIC membership threshold" help="Minimum membership a peak should have to be counted in the MIC sum. E.g. 0.8 " /> - - </when> - </conditional> - - - </inputs> - <outputs> - <data name="centrotypesOut" format="msclust.csv" label="${tool.name} on ${on_string} - centrotypes file"/> - <data name="simOut" format="msclust.csv" label="${tool.name} on ${on_string} - SIM file"/> - <data name="micOut" format="msclust.csv" label="${tool.name} on ${on_string} - MIC file"/> - <data name="mspOut" format="msp" label="${tool.name} on ${on_string} - SPECTRA file"/> - <data name="classOut" format="msclust.csv" label="${tool.name} on ${on_string} - Classification file"/> - <data name="htmlReportFile" format="html" label="${tool.name} on ${on_string} - HTML report"> - <!-- If the expression is false, the file is not created --> - <filter>( summaryReport == True )</filter> - </data> - </outputs> - <tests> - <!-- find out how to use --> - </tests> - <help> - -<!-- see also http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#hyperlink-targets --> - -.. 
class:: infomark - -This tool extracts spectra from ion-wise aligned MS(/MS) results. It uses expression profiles and -retention times of the putative ions to cluster them. Each cluster is then used to generate -one spectrum containing the clustered ions (peaks). - -.. image:: $PATH_TO_IMAGES/msclust_summary.png - - ------ - -**Output** - -This tool returns a number of output files and a small report. - -**Parameters index** - - -*Select the approach used for imputing missing values:* only select this if you have used a specific method to -fill in the data gaps in the input file. One example is replacing zero values by some randomly generated low value. -If MeTot is chosen, then a value is considered generated if: the value contains a dot '.' and some number -other than 0 (zero) after the dot. - -*Effective Peaks:* Neighborhood window size to consider when calculating density. Smaller values increase -performance but are less reliable. - -*Peak Width, in scans:* Scan window width of scans to consider 'close'. One can see this as the -'tolerated variation in scans' for the apex positions of the fragment peaks composing a cluster. -Note: if MetAlign was used, this is the variation *after* pre-processing by MetAlign. - -*Peak Width confidence:* The higher the confidence, the stricter the threshold. - -*Correlation threshold (0.0 - 1.0):* Tolerance center for Pearson distance calculation. The higher this value, -the higher the correlation between 2 items has to be for them to be considered 'close'. - -*Correlation threshold confidence:* The higher the confidence, the stricter the threshold. `More...`__ - -*Potential Density reduction (0.0 - 1.0):* Reduction tolerance center for Pearson distance calculation. -The higher this value, the less the low-correlated items get reduced, getting a chance to form a cluster of their own. - -*Potential Density reduction softness:* Reduction curve slope for Pearson distance tolerance. Lower -values = stricter separation at the value determined in 'Potential Density reduction' above -(TODO review this comment). - -*Stop Criterion:* When to stop reducing and looking for new clusters. Lower values = more iterations - -.. __: javascript:window.open('$PATH_TO_IMAGES/confidence_and_slope_params_explain.png','popUpWindow','height=700,width=800,left=10,top=10,resizable=yes,scrollbars=yes,toolbar=yes,menubar=no,location=no,directories=no,status=yes') - - ------ - -**Output files described below** - ------ - -*SPECTRA:* this file can be submitted to NIST for identification of the spectra. - -`Click here for more details on the Sample selection and Spectrum peak intensity correction algorithm parameters related to SPECTRA generation`_ - -.. _Click here for more details on the Sample selection and Spectrum peak intensity correction algorithm parameters related to SPECTRA generation: javascript:window.open('$PATH_TO_IMAGES/sample_sel_and_peak_height_correction.png','popUpWindow','height=700,width=800,left=10,top=10,resizable=yes,scrollbars=yes,toolbar=yes,menubar=no,location=no,directories=no,status=yes') - ------ - -*MIC:* stands for Measured Ions Count -> it contains, for each cluster, the sum of the ion count -values (corrected by their membership) for all MEASURED cluster ions in the given sample. 
- -The MIC for a **cluster i** in **sample s**, where **cluster i** has **n** members is thus: - -sum ( [intensity of member n in **sample s**] x [membership value of member n in **cluster i** ] ) - ------ - -*SIM:* stands for Selective Ion Mode -> it contains, for each cluster, the intensity values of the -most representative member ion peak of this cluster. The most representative member peak is the one with the -highest membership*average_intensity. This definition leads to conflicts as a peak can have a -membership in two or more clusters. The assignment of a SIM peak to a cluster depends on -the configured data type (LC or GC-MS). NB: this can be overruled in the "advanced settings": - -(1) LC-MS SIM: select SIM peak only once and for the centrotype in which this specific mass has its -highest membership; for neighboring centrotypes use its "second best SIM", etcetera. In other words, -if the SIM peak has been identified as the SIM in more than 1 cluster, assign as SIM to the cluster -with highest membership. Continue searching for other SIM peaks to assign to the other clusters until -all ambiguities are solved. - -(2) GC-MS SIM: the SIM peak can be "shared" by multiple clusters. However, the intensity values are corrected -by the membership value of the peak in the cluster in case the SIM peak is "shared". If the SIM peak is not -"shared" then the "raw" intensity values of the SIM peak are recorded in the SIM file. - -`Click here for more details on the SIM output file`_ - -.. _Click here for more details on the SIM output file: javascript:window.open('$PATH_TO_IMAGES/sample_SIM.png','popUpWindow','height=700,width=800,left=10,top=10,resizable=yes,scrollbars=yes,toolbar=yes,menubar=no,location=no,directories=no,status=yes') - - - - </help> -</tool>
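To make the MIC definition above concrete: a minimal sketch of the per-sample MIC sum in Python (the member intensities and membership values below are hypothetical, not actual MsClust output):

    # MIC(cluster i, sample s) = sum of intensity * membership over all measured members.
    # Hypothetical (intensity in sample s, membership in cluster i) pairs:
    members = [(12000.0, 0.95), (8500.0, 0.80), (300.0, 0.10)]
    mic = sum(intensity * membership for intensity, membership in members)
    print mic  # 11400.0 + 6800.0 + 30.0 = 18230.0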
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/primsfilters.py Fri Oct 24 12:52:56 2014 +0200 @@ -0,0 +1,44 @@ +import logging +log = logging.getLogger( __name__ ) + + +def restrict_prims_metabolomics( context, tool ): + """ + This tool filter will hide prims_metabolomics tools for non-metabolomics users. + This can be enabled by adding the following to the + ``app:main`` section of ``universe_wsgi.ini``:: + + tool_filters = primsfilters:restrict_prims_metabolomics + + and by adding this file to the folder: + + <galaxy-dist>/lib/galaxy/tools/filters + + This is optional and can be used in case some control is desired on who + gets to see the prims_metabolomics tools. When not using this file and the + settings mentioned above, all prims_metabolomics tools will be visible to + all users. + """ + # for debugging: import pydevd;pydevd.settrace("L0136815.wurnet.nl") + user = context.trans.user + metabolomics_tools = [ "msclust2", "combine_output", "create_poly_model", "lookup_library", + "NDIStext2tabular", "rankfilterGCMS_tabular", "filter_on_rank", + "export_to_metexp_tabular", "query_metexp" ] + found_match = False + # iterate over the tool (partial) ids and look for a match (this is compatible with tool shed given ids): + for partial_id in metabolomics_tools: + if tool.id.find("/"+ partial_id + "/") >= 0: + found_match = True + break + # the second part of this if is compatible with the ids when NOT using tool shed: + if found_match or tool.id in metabolomics_tools: + # logging.warn( 'FILTER MATCHED: %s' %(tool.name)) + + for user_role in user.roles: + if user_role.role.name == "PRIMS_METABOLOMICS": + return True + # not found to have the role, return false: + return False + else: + # return true for any other tool + return True \ No newline at end of file
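A minimal sketch of how this filter behaves, using hypothetical stub classes in place of Galaxy's real context/tool objects (only the attributes the filter actually reads are stubbed; the tool id mimics a tool-shed-style id):

    import primsfilters

    # Hypothetical stubs exposing just context.trans.user.roles[i].role.name and tool.id:
    class Role(object):
        def __init__(self, name): self.name = name
    class UserRole(object):
        def __init__(self, name): self.role = Role(name)
    class User(object):
        def __init__(self, names): self.roles = [UserRole(n) for n in names]
    class Trans(object):
        def __init__(self, user): self.user = user
    class Context(object):
        def __init__(self, user): self.trans = Trans(user)
    class Tool(object):
        def __init__(self, tool_id): self.id = tool_id

    # a metabolomics tool is only shown to users holding the PRIMS_METABOLOMICS role;
    # any other tool id would return True for both users:
    tool = Tool("toolshed.g2.bx.psu.edu/repos/pieterlukasse/prims_metabolomics/msclust2/2.0.1")
    print primsfilters.restrict_prims_metabolomics(Context(User(["PRIMS_METABOLOMICS"])), tool)  # True
    print primsfilters.restrict_prims_metabolomics(Context(User(["SOME_OTHER_ROLE"])), tool)     # False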
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query_mass_repos.py Fri Oct 24 12:52:56 2014 +0200 @@ -0,0 +1,289 @@ +#!/usr/bin/env python +# encoding: utf-8 +''' +Module to query a set of accurate mass values detected by high-resolution mass spectrometers +against various repositories/services such as the METabolomics EXPlorer database or the +MFSearcher service (http://webs2.kazusa.or.jp/mfsearcher/). + +It will take the input file and for each record it will query the +molecular mass in the selected repository/service. If one or more compounds are found +then extra information regarding these compounds is added to the output file. + +The output file is thus the input file enriched with information about +related items found in the selected repository/service. + +The service should implement the following interface: + +http://service_url/mass?targetMs=500&margin=1&marginUnit=ppm&output=txth (txth means there is guaranteed to be a header line before the data) + +The output should be tab separated and should contain the following columns (in this order): +db-name molecular-formula dbe formula-weight id description + + +''' +import csv +import sys +import fileinput +import urllib2 +import time +from collections import OrderedDict + +__author__ = "Pieter Lukasse" +__contact__ = "pieter.lukasse@wur.nl" +__copyright__ = "Copyright, 2014, Plant Research International, WUR" +__license__ = "Apache v2" + +def _process_file(in_xsv, delim='\t'): + ''' + Generic method to parse a tab-separated file returning a dictionary with named columns + @param in_xsv: input filename to be parsed + ''' + data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim)) + return _process_data(data) + +def _process_data(data): + + header = data.pop(0) + # Create dictionary with column name as key + output = OrderedDict() + for index in xrange(len(header)): + output[header[index]] = [row[index] for row in data] + return output + + +def _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit): + + ''' + This method will iterate over the records in the input_data and + will enrich them with the related information found (if any) in the + chosen repository/service + + # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies + ''' + merged = [] + + for i in xrange(len(input_data[input_data.keys()[0]])): + # Get the record in same dictionary format as input_data, but containing + # a value at each column instead of a list of all values of all records: + input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()])) + + # read the molecular mass : + molecular_mass = input_data_record[molecular_mass_col] + + + # search for related records in repository/service: + data_found = None + if molecular_mass != "": + molecular_mass = float(molecular_mass) + + # 1- search for data around this MM: + query_link = repository_dblink + "/mass?targetMs=" + str(molecular_mass) + "&margin=" + str(error_margin) + "&marginUnit=" + margin_unit + "&output=txth" + + data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv") + data_type_found = "MM" + + + if data_found == None: + # If still nothing found, just add empty columns + extra_cols = ['', '','','','',''] + else: + # Add info found: + extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link) + + # Take all data and merge it into a "flat"/simple array of values: + 
field_values_list = _merge_data(input_data_record, extra_cols) + + merged.append(field_values_list) + + # return the merged/enriched records: + return merged + + +def _get_extra_info_and_link_cols(data_found, data_type_found, query_link): + ''' + This method will go over the data found and will return a + list with the following items: + - details of hits found : + db-name molecular-formula dbe formula-weight id description + - Link that executes same query + + ''' + + # set() makes a unique list: + db_name_set = [] + molecular_formula_set = [] + id_set = [] + description_set = [] + + + if 'db-name' in data_found: + db_name_set = set(data_found['db-name']) + elif '# db-name' in data_found: + db_name_set = set(data_found['# db-name']) + if 'molecular-formula' in data_found: + molecular_formula_set = set(data_found['molecular-formula']) + if 'id' in data_found: + id_set = set(data_found['id']) + if 'description' in data_found: + description_set = set(data_found['description']) + + result = [data_type_found, + _to_xsv(db_name_set), + _to_xsv(molecular_formula_set), + _to_xsv(id_set), + _to_xsv(description_set), + #To let Excel interpret as link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"): + "=HYPERLINK(\""+ query_link + "\", \"Link to entries found in DB \")"] + return result + + +def _to_xsv(data_set): + result = "" + for item in data_set: + result = result + str(item) + "|" + return result + + +def _fire_query_and_return_dict(url): + ''' + This method will fire the query as a web-service call and + return the results as a list of dictionary objects + ''' + + try: + data = urllib2.urlopen(url).read() + + # transform to dictionary: + result = [] + data_rows = data.split("\n") + + # remove comment lines if any (only leave the one that has "molecular-formula" word in it...compatible with kazusa service): + data_rows_to_remove = [] + for data_row in data_rows: + if data_row == "" or (data_row[0] == '#' and "molecular-formula" not in data_row): + data_rows_to_remove.append(data_row) + + for data_row in data_rows_to_remove: + data_rows.remove(data_row) + + # check if there is any data in the response: + if len(data_rows) <= 1 or data_rows[1].strip() == '': + # means there is only the header row...so no hits: + return None + + for data_row in data_rows: + if not data_row.strip() == '': + row_as_list = _str_to_list(data_row, delimiter='\t') + result.append(row_as_list) + + # return result processed into a dict: + return _process_data(result) + + except urllib2.HTTPError, e: + raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason) + except urllib2.URLError, e: + raise Exception( "Network error: %s" % e.reason.args[1] + ". Administrator: please check if service [" + url + "] is accessible from your Galaxy server. ") + +def _str_to_list(data_row, delimiter='\t'): + result = [] + for column in data_row.split(delimiter): + result.append(column) + return result + + +# alternative: ? +# s = requests.Session() +# s.verify = False +# #s.auth = (token01, token02) +# resp = s.get(url, params={'name': 'anonymous'}, stream=True) +# content = resp.content +# # transform to dictionary: + + + + +def _merge_data(input_data_record, extra_cols): + ''' + Adds the extra information to the existing data record and returns + the combined new record. 
+ ''' + record = [] + for column in input_data_record: + record.append(input_data_record[column]) + + + # add extra columns + for column in extra_cols: + record.append(column) + + return record + + +def _save_data(data_rows, headers, out_csv): + ''' + Writes tab-separated data to file + @param data_rows: list containing the merged/enriched dataset rows + @param out_csv: output csv file + ''' + + # Open output file for writing + outfile_single_handle = open(out_csv, 'wb') + output_single_handle = csv.writer(outfile_single_handle, delimiter="\t") + + # Write headers + output_single_handle.writerow(headers) + + # Write one line for each row + for data_row in data_rows: + output_single_handle.writerow(data_row) + +def _get_repository_URL(repository_file): + ''' + Read out and return the URL stored in the given file. + ''' + file_input = fileinput.input(repository_file) + try: + for line in file_input: + if line[0] != '#': + # just return the first line that is not a comment line: + return line + finally: + file_input.close() + + +def main(): + ''' + Query main function + + The input file can be any tabular file, as long as it contains a column for the molecular mass. + This column is then used to query against the chosen repository/service database. + ''' + seconds_start = int(round(time.time())) + + input_file = sys.argv[1] + molecular_mass_col = sys.argv[2] + repository_file = sys.argv[3] + error_margin = float(sys.argv[4]) + margin_unit = sys.argv[5] + output_result = sys.argv[6] + + # Parse repository_file to find the URL to the service: + repository_dblink = _get_repository_URL(repository_file) + + # Parse tabular input file into dictionary/array: + input_data = _process_file(input_file) + + # Query data against repository : + enriched_data = _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit) + headers = input_data.keys() + ['SEARCH hits for ','SEARCH hits: db-names', 'SEARCH hits: molecular-formulas ', + 'SEARCH hits: ids','SEARCH hits: descriptions', 'Link to SEARCH hits'] #TODO - add min and max formula weight columns + + _save_data(enriched_data, headers, output_result) + + seconds_end = int(round(time.time())) + print "Took " + str(seconds_end - seconds_start) + " seconds" + + + +if __name__ == '__main__': + main()
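The positional arguments read in main() above can be exercised directly; a minimal sketch mirroring the style of the unit tests added in this changeset (all file names below are hypothetical, and the import path may differ: the unit tests use "from MS import query_mass_repos"):

    import sys
    import query_mass_repos

    sys.argv = ['query_mass_repos',
                'masses.txt',               # tabular input containing a molecular mass column
                'mass (Da)',                # name of that column
                'ExactMassDB_service.txt',  # file whose first non-comment line is the service URL
                '0.001',                    # error margin
                'ms',                       # margin unit ('ms' or 'ppm')
                'masses_enriched.txt']      # output: the input enriched with the hits found
    query_mass_repos.main()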
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query_mass_repos.xml Fri Oct 24 12:52:56 2014 +0200 @@ -0,0 +1,106 @@ +<tool id="query_mass_repos" + name="METEXP - Find elemental composition formulas based on mass values " + version="0.1.0"> + <description>Query multiple public repositories for elemental compositions from accurate mass values detected by high-resolution mass spectrometers</description> + <command interpreter="python"> + query_mass_repos.py + $input_file + "$molecular_mass_col" + "$repository_file" + $error_margin + $margin_unit + $output_result + </command> + <inputs> + + <param name="input_file" format="tabular" type="data" + label="Input file" + help="Select a tabular file containing the entries to be queried against the selected repository/service"/> + + <param name="molecular_mass_col" type="text" size="50" + label="Molecular mass column name" + value="MM" + help="Name of the column containing the molecular mass information (in the given input file)" /> + + <param name="repository_file" type="select" label="Repository/service to query" + help="Select the repository/service which should be queried" + dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/MetExp_MassSearch_Services")'/> + + <param name="error_margin" type="float" size="10" + label="Error margin" + value="0.01" + help="Mass difference allowed when searching in the repositories for a mass match." /> + + <param name="margin_unit" type="select" label="Margin unit"> + <option value="ms" selected="True">ms</option> + <option value="ppm">ppm</option> + </param> + <!-- TODO + <param name="metexp_access_key" type="text" size="50" + label="(Optional) MetExp access key" + value="" + help="Key needed to get access to MetExp services. Fill in if MetExp service was selected" /> --> + + </inputs> + <outputs> + <data name="output_result" format="tabular" label="${tool.name} on ${on_string}" /> + </outputs> + <code file="match_library.py" /> <!-- file containing get_directory_files function used above--> + <help> +.. class:: infomark + +This tool will query multiple public repositories such as PRI-MetExp or http://webs2.kazusa.or.jp/mfsearcher +for elemental compositions from accurate mass values detected by high-resolution mass spectrometers. + +It will take the input file and for each record it will query the +molecular mass in the selected repository. If one or more compounds are found in the +repository then extra information regarding (mass based) matching elemental composition formulas is added to the output file. + +The output file is thus the input file enriched with information about +related items found in the selected repository. + +**Notes** + +The input file can be any tabular file, as long as it contains a column for the molecular mass. + +**Services that can be queried** + +================= ========================================================================= +Database Description +----------------- ------------------------------------------------------------------------- +PRI-MetExp LC-MS and GC-MS data from experiments from the metabolomics group at + Plant Research International. NB: restricted access to employees with + access key. +ExactMassDB A database of possible elemental compositions consisting of C: 100, + H: 200, O: 50, N: 10, P: 10, and S: 10, that satisfy the Senior and + the Lewis valence rules. 
+ (via /mfsearcher/exmassdb/) +ExactMassDB-HR2 HR2, which is one of the fastest tools for calculation of elemental + compositions, filters some elemental compositions according to + the Seven Golden Rules (Kind and Fiehn, 2007). The ExactMassDB-HR2 + database returns the same result as does HR2 with the same atom kind + and number condition as that used in construction of the ExactMassDB. + (via /mfsearcher/exmassdb-hr2/) +Pep1000 A database of possible linear polypeptides that are + constructed with 20 kinds of amino acids and having molecular + weights smaller than 1000. + (via /mfsearcher/pep1000/) +KEGG Re-calculated compound data from KEGG. Weekly updated. + (via /mfsearcher/kegg/) +KNApSAcK Re-calculated compound data from KNApSAcK. + (via /mfsearcher/knapsack/) +Flavonoid Viewer Re-calculated compound data from Flavonoid Viewer. + (via /mfsearcher/flavonoidviewer/) +LipidMAPS Re-calculated compound data from LIPID MAPS. + (via /mfsearcher/lipidmaps/) +HMDB Re-calculated compound data from Human Metabolome Database (HMDB) + Version 3.5. + (via /mfsearcher/hmdb/) +PubChem Re-calculated compound data from PubChem. Monthly updated. + (via /mfsearcher/pubchem/) +================= ========================================================================= + +Sources for table above: PRI-MetExp and http://webs2.kazusa.or.jp/mfsearcher + + </help> +</tool>
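For reference, the txth response such a service is expected to return is a tab-separated table with one header line, as described in query_mass_repos.py above. A hypothetical example (the id and description values are illustrative only, not real query output):

    # db-name	molecular-formula	dbe	formula-weight	id	description
    ExactMassDB	C6H12O6	1	180.06339	EMDB-00042	(illustrative hit)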
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query_metexp.py Fri Oct 24 12:52:56 2014 +0200 @@ -0,0 +1,282 @@ +#!/usr/bin/env python +# encoding: utf-8 +''' +Module to query a set of identifications against the METabolomics EXPlorer database. + +It will take the input file and for each record it will query the +molecular mass in the selected MetExp DB. If one or more compounds are found in the +MetExp DB then extra information regarding these compounds is added to the output file. + +The output file is thus the input file enriched with information about +related items found in the selected MetExp DB. +''' +import csv +import sys +import fileinput +import urllib2 +import time +from collections import OrderedDict + +__author__ = "Pieter Lukasse" +__contact__ = "pieter.lukasse@wur.nl" +__copyright__ = "Copyright, 2014, Plant Research International, WUR" +__license__ = "Apache v2" + +def _process_file(in_xsv, delim='\t'): + ''' + Generic method to parse a tab-separated file returning a dictionary with named columns + @param in_xsv: input filename to be parsed + ''' + data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim)) + return _process_data(data) + +def _process_data(data): + + header = data.pop(0) + # Create dictionary with column name as key + output = OrderedDict() + for index in xrange(len(header)): + output[header[index]] = [row[index] for row in data] + return output + + +def _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method): + ''' + This method will iterate over the records in the input_data and + will enrich them with the related information found (if any) in the + MetExp Database. + + # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies + ''' + merged = [] + + for i in xrange(len(input_data[input_data.keys()[0]])): + # Get the record in same dictionary format as input_data, but containing + # a value at each column instead of a list of all values of all records: + input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()])) + + # read the CAS id, formula and molecular mass: + cas_id = input_data_record[casid_col] + formula = input_data_record[formula_col] + molecular_mass = input_data_record[molecular_mass_col] + + # search for related records in MetExp: + data_found = None + if cas_id != "undef": + # 1- search for other experiments where this CAS id has been found: + query_link = metexp_dblink + "/find_entries/query?cas_nr="+ cas_id + "&method=" + separation_method + data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv") + data_type_found = "CAS" + if data_found == None: + # 2- search for other experiments where this FORMULA has been found: + query_link = metexp_dblink + "/find_entries/query?molecule_formula="+ formula + "&method=" + separation_method + data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv") + data_type_found = "FORMULA" + if data_found == None: + # 3- search for other experiments where this MM has been found: + query_link = metexp_dblink + "/find_entries/query?molecule_mass="+ molecular_mass + "&method=" + separation_method + data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv") + data_type_found = "MM" + + if data_found == None: + # If still nothing found, just add empty columns + extra_cols = ['', '','','','','','',''] + else: + # Add info found: + extra_cols = 
_get_extra_info_and_link_cols(data_found, data_type_found, query_link) + + # Take all data and merge it into a "flat"/simple array of values: + field_values_list = _merge_data(input_data_record, extra_cols) + + merged.append(field_values_list) + + # return the merged/enriched records: + return merged + + +def _get_extra_info_and_link_cols(data_found, data_type_found, query_link): + ''' + This method will go over the data found and will return a + list with the following items: + - Experiment details where hits have been found : + 'organism', 'tissue','experiment_name','user_name','column_type' + - Link that executes same query + + ''' + # set() makes a unique list: + organism_set = [] + tissue_set = [] + experiment_name_set = [] + user_name_set = [] + column_type_set = [] + cas_nr_set = [] + + if 'organism' in data_found: + organism_set = set(data_found['organism']) + if 'tissue' in data_found: + tissue_set = set(data_found['tissue']) + if 'experiment_name' in data_found: + experiment_name_set = set(data_found['experiment_name']) + if 'user_name' in data_found: + user_name_set = set(data_found['user_name']) + if 'column_type' in data_found: + column_type_set = set(data_found['column_type']) + if 'CAS' in data_found: + cas_nr_set = set(data_found['CAS']) + + + result = [data_type_found, + _to_xsv(organism_set), + _to_xsv(tissue_set), + _to_xsv(experiment_name_set), + _to_xsv(user_name_set), + _to_xsv(column_type_set), + _to_xsv(cas_nr_set), + #To let Excel interpret as link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"): + "=HYPERLINK(\""+ query_link + "\", \"Link to entries found in DB \")"] + return result + + +def _to_xsv(data_set): + result = "" + for item in data_set: + result = result + str(item) + "|" + return result + + +def _fire_query_and_return_dict(url): + ''' + This method will fire the query as a web-service call and + return the results as a list of dictionary objects + ''' + + try: + data = urllib2.urlopen(url).read() + + # transform to dictionary: + result = [] + data_rows = data.split("\n") + + # check if there is any data in the response: + if len(data_rows) <= 1 or data_rows[1].strip() == '': + # means there is only the header row...so no hits: + return None + + for data_row in data_rows: + if not data_row.strip() == '': + row_as_list = _str_to_list(data_row, delimiter='\t') + result.append(row_as_list) + + # return result processed into a dict: + return _process_data(result) + + except urllib2.HTTPError, e: + raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason) + except urllib2.URLError, e: + raise Exception( "Network error: %s" % e.reason.args[1] + ". Administrator: please check if MetExp service [" + url + "] is accessible from your Galaxy server. ") + +def _str_to_list(data_row, delimiter='\t'): + result = [] + for column in data_row.split(delimiter): + result.append(column) + return result + + +# alternative: ? +# s = requests.Session() +# s.verify = False +# #s.auth = (token01, token02) +# resp = s.get(url, params={'name': 'anonymous'}, stream=True) +# content = resp.content +# # transform to dictionary: + + + + +def _merge_data(input_data_record, extra_cols): + ''' + Adds the extra information to the existing data record and returns + the combined new record. 
+ ''' + record = [] + for column in input_data_record: + record.append(input_data_record[column]) + + + # add extra columns + for column in extra_cols: + record.append(column) + + return record + + +def _save_data(data_rows, headers, out_csv): + ''' + Writes tab-separated data to file + @param data_rows: list containing the merged/enriched dataset rows + @param out_csv: output csv file + ''' + + # Open output file for writing + outfile_single_handle = open(out_csv, 'wb') + output_single_handle = csv.writer(outfile_single_handle, delimiter="\t") + + # Write headers + output_single_handle.writerow(headers) + + # Write one line for each row + for data_row in data_rows: + output_single_handle.writerow(data_row) + +def _get_metexp_URL(metexp_dblink_file): + ''' + Read out and return the URL stored in the given file. + ''' + file_input = fileinput.input(metexp_dblink_file) + try: + for line in file_input: + if line[0] != '#': + # just return the first line that is not a comment line: + return line + finally: + file_input.close() + + +def main(): + ''' + MetExp Query main function + + The input file can be any tabular file, as long as it contains a column for the CAS number, + one for the molecular mass and one for the formula of the respective identification. These + columns are then used to query against the MetExp Database. + ''' + seconds_start = int(round(time.time())) + + input_file = sys.argv[1] + casid_col = sys.argv[2] + formula_col = sys.argv[3] + molecular_mass_col = sys.argv[4] + metexp_dblink_file = sys.argv[5] + separation_method = sys.argv[6] + output_result = sys.argv[7] + + # Parse metexp_dblink_file to find the URL to the MetExp service: + metexp_dblink = _get_metexp_URL(metexp_dblink_file) + + # Parse tabular input file into dictionary/array: + input_data = _process_file(input_file) + + # Query data against MetExp DB : + enriched_data = _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method) + headers = input_data.keys() + ['METEXP hits for ','METEXP hits: organisms', 'METEXP hits: tissues', + 'METEXP hits: experiments','METEXP hits: user names','METEXP hits: column types', 'METEXP hits: CAS nrs', 'Link to METEXP hits'] + + _save_data(enriched_data, headers, output_result) + + seconds_end = int(round(time.time())) + print "Took " + str(seconds_end - seconds_start) + " seconds" + + + +if __name__ == '__main__': + main()
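The three-step fallback in _query_and_add_data above (first CAS number, then formula, then molecular mass) results in query URLs of the following form; a sketch with a hypothetical base URL and glycerol as the example compound:

    metexp_dblink = "http://metexp.example.org/rest"  # hypothetical base URL
    separation_method = "GC"
    # 1- by CAS number:
    q1 = metexp_dblink + "/find_entries/query?cas_nr=56-81-5&method=" + separation_method
    # 2- fallback, by formula:
    q2 = metexp_dblink + "/find_entries/query?molecule_formula=C3H8O3&method=" + separation_method
    # 3- last resort, by molecular mass:
    q3 = metexp_dblink + "/find_entries/query?molecule_mass=92.09&method=" + separation_method
    # each URL is fired with "&_format_result=tsv" appended; the first query returning
    # data rows determines the reported hit type (CAS, FORMULA or MM)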
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query_metexp.xml Fri Oct 24 12:52:56 2014 +0200 @@ -0,0 +1,69 @@ +<tool id="query_metexp" + name="METEXP - Query Database " + version="0.1.0"> + <description>Query a set of identifications against the METabolomics EXPlorer database</description> + <command interpreter="python"> + query_metexp.py + $input_file + "$casid_col" + "$formula_col" + "$molecular_mass_col" + "$metexp_dblink_file" + $separation_method + $output_result + </command> + <inputs> + + <param name="input_file" format="tabular" type="data" + label="Input file" + help="Select a tabular file containing the entries to be queried/verified in the MetExp DB"/> + + <param name="separation_method" type="select" label="Data type to query"> + <option value="GC" selected="True">GC</option> + <option value="LC">LC</option> + </param> + + + <param name="casid_col" type="text" size="50" + label="CAS ID column name" + value="CAS" + help="Name of the column containing the CAS code information (in the given input file)" /> + <param name="formula_col" type="text" size="50" + label="Formula column name" + value="FORMULA" + help="Name of the column containing the formula information (in the given input file)" /> + <param name="molecular_mass_col" type="text" size="50" + label="Molecular mass column name" + value="MM" + help="Name of the column containing the molecular mass information (in the given input file)" /> + + <param name="metexp_dblink_file" type="select" label="MetExp DB to query" + help="Select the MetExp Database/backend which should be queried" + dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/MetExp_Databases")'/> + + + </inputs> + <outputs> + <data name="output_result" format="tabular" label="${tool.name} on ${on_string}" /> + </outputs> + <code file="match_library.py" /> <!-- file containing get_directory_files function used above--> + <help> +.. class:: infomark + +This tool will query a set of identifications against the METabolomics EXPlorer database. + +It will take the input file and for each record it will query the +molecular mass in the selected MetExp DB. If one or more compounds are found in the +MetExp DB then extra information regarding these compounds is added to the output file. + +The output file is thus the input file enriched with information about +related items found in the selected MetExp DB. + +**Notes** + +The input file can be any tabular file, as long as it contains a column for the CAS number, +one for the molecular mass and one for the formula of the respective identification. + + + </help> +</tool>
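A hypothetical minimal input file for this tool (tab separated; the three queried columns are matched by name, and all input columns are carried over into the enriched output):

    Name	CAS	FORMULA	MM
    glycerol (example row)	56-81-5	C3H8O3	92.09
    unknown (example row)	undef	C6H12O6	180.16

Note that a CAS value of 'undef' skips the CAS lookup and falls through to the formula and molecular mass queries.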
--- a/rankfilterGCMS_tabular.xml Thu Jan 16 13:22:38 2014 +0100 +++ b/rankfilterGCMS_tabular.xml Fri Oct 24 12:52:56 2014 +0200 @@ -3,11 +3,14 @@ <command interpreter="python">rankfilter_GCMS/rankfilter.py $input_file</command> <inputs> <param format="tabular" name="sample" type="data" label="Sample File" - help="Converted PDF file in tabular format" /> + help="Select a tab delimited NIST metabolite identifications file (converted from PDF)" /> <!-- question: is this calibration file not column specific as it includes RT info?? --> - <param name="calibration" type="select" label="Calibration File" + <!-- this one should be input file for now:<param name="calibration" type="select" label="Calibration File" help="Calibration file with reference masses (e.g. alkanes) with their RT and RI values" dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/RankFilter_Calibration_Files")'/> + --> + <param name="calibration" format="any" type="data" label="Calibration File" + help="Calibration file containing reference masses (e.g. alkanes) with their respective RT and RI values"/> <param name="analysis_type" type="select" format="text" label="Analysis Type" help="Select the type of analysis that has been used to generate the sample file">
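The exact column layout rankfilter.py expects for the calibration file is not shown in this changeset; purely to illustrate the idea (n-alkane reference masses with their retention times and retention indices), a hypothetical calibration file might look like:

    MW	R.T.	RI
    128.26	5.12	900
    142.28	6.03	1000
    156.31	6.91	1100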
--- a/rankfilter_GCMS/pdfread.py Thu Jan 16 13:22:38 2014 +0100 +++ b/rankfilter_GCMS/pdfread.py Fri Oct 24 12:52:56 2014 +0200 @@ -52,8 +52,9 @@ for line in hit_list: line = line.strip().translate(None, '\r') if line != '': - hits = line.replace('\n', ' ').replace('\x0c', '').replace('^L', '').split('Hit') - + hits = line.replace('\n', ' ').replace('\x0c', '').replace('^L', '').split('Hit') # possible solution: if we replaced the \n by some special sign instead of ' ', then reading the formula would be simpler! + # strange... code seems fine actually... debug! See test/data/download.pdf + # strange thing is that it looks like the new line does not end up in the text file, even though it looks like there is a new line in the pdf... perhaps a bug in the pdf2text command in Linux? spec_id = hits.pop(0).split(' ')[1] j = 0 for hh in hits: @@ -69,8 +70,13 @@ name_tmp = ':'.join(cell[0].split(':')[1:]) else: name_tmp = cell[0].split(':')[1] + + # ugly workaround for the cases where there ends up being no space between the name and the formula: exhaustive + # replacement of known cases by the same with a white space: name_tmp = name_tmp.replace('lC', 'l C').replace(']C', '] C').replace('sC', 's C').replace('9C', '9 C').replace('.C', '. C') name_tmp = name_tmp.replace(')C', ') C').replace('eC', 'e C').replace('yC', 'y C').replace('oC', 'o C').replace('-C', '- C').replace('dC', 'd C').replace('rC', 'r C') + name_tmp = name_tmp.replace('-, LC', '-, L C').replace('-, DC', '-, D C') + name.append((' '.join(name_tmp.split(' ')[0:len(name_tmp) - 1])).replace(" ", " ")) if name_tmp: if name_tmp.split(' ')[-1][0] == 'C' or name_tmp.split(' ')[-1][0] == 'F' or name_tmp.split(' ')[-1][0] == 'H':
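The effect of the added replaces can be illustrated with the dummy hit from the unit test below: the PDF text contains '-, LC21H52O6Si5' with no space before the formula, and the new replace('-, LC', '-, L C') restores the separator so that the last space-delimited token parses as the formula:

    name_tmp = 'Sorbopyranose, 1,2,3,4,5-pentakis-O-(trimethylsilyl)-, LC21H52O6Si5'
    name_tmp = name_tmp.replace('-, LC', '-, L C')
    print name_tmp.split(' ')[-1]  # 'C21H52O6Si5' -> starts with 'C', so recognized as a formula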
--- a/rankfilter_GCMS/test/test_pdfread.py Thu Jan 16 13:22:38 2014 +0100 +++ b/rankfilter_GCMS/test/test_pdfread.py Fri Oct 24 12:52:56 2014 +0200 @@ -17,13 +17,20 @@ ''' Tests the reading and parsing of a NIST PDF file ''' - [hitlist, hitlist_missed] = pdfread.getPDF(self.nist_pdf) + [hitlist, hitlist_missed] = pdfread.getPDF(self.nist_pdf, True) rows = [hitlist[row] for row in hitlist.keys()] data = [set(row) for row in zip(*rows)] expected_element = set(('12.3', ' Sucrose ', '14', 'undef', ' standards 2009', ' 660', 'not_def', '18495-0.142537-21284-2.26544e+07-135', '22.6544', ' 714')) self.failUnless(expected_element in data) self.failUnless(len(hitlist_missed) != 0) + ''' + Check for last (dummy) hit: + Hit 6 : (dummy hit)Sorbopyranose, 1,2,3,4,5-pentakis-O-(trimethylsilyl)-, LC21H52O6Si5;MF: 658; RMF: 658; Prob 15.6%; CAS: 30645-02-4; Lib: mainlib; ID: 37062. + ''' + expected_element = set(['C21H52O6Si5', ' 30645-02-4', ' mainlib', '15.6', ' (dummy hit)Sorbopyranose, 1,2,3,4,5-pentakis-O-(trimethylsilyl)-, L C21H52O6Si5', '7298-1-9580-1.29014e+07-9', ' 658', '12.9014', '37062']) + self.failUnless(expected_element in data) + if __name__ == "__main__": #import sys;sys.argv = ['', 'Test.test_getPDF']
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/repository_dependencies.xml Fri Oct 24 12:52:56 2014 +0200 @@ -0,0 +1,4 @@ +<?xml version="1.0"?> +<repositories description="Required metaMS dependencies."> + <repository toolshed="http://testtoolshed.g2.bx.psu.edu" name="prims_metabolomics_dependencies" owner="pieterlukasse" changeset_revision="62b3f4d6e015" /> +</repositories>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/static_resources/elements_and_masses.tab Fri Oct 24 12:52:56 2014 +0200 @@ -0,0 +1,104 @@ +Name Atomic number Chemical symbol Relative atomic mass +Hydrogen 1 H 1.01 +Helium 2 He 4 +Lithium 3 Li 6.94 +Beryllium 4 Be 9.01 +Boron 5 B 10.81 +Carbon 6 C 12.01 +Nitrogen 7 N 14.01 +Oxygen 8 O 16 +Fluorine 9 F 19 +Neon 10 Ne 20.18 +Sodium 11 Na 22.99 +Magnesium 12 Mg 24.31 +Aluminum 13 Al 26.98 +Silicon 14 Si 28.09 +Phosphorus 15 P 30.98 +Sulfur 16 S 32.06 +Chlorine 17 Cl 35.45 +Argon 18 Ar 39.95 +Potassium 19 K 39.1 +Calcium 20 Ca 40.08 +Scandium 21 Sc 44.96 +Titanium 22 Ti 47.9 +Vanadium 23 V 50.94 +Chromium 24 Cr 52 +Manganese 25 Mn 54.94 +Iron 26 Fe 55.85 +Cobalt 27 Co 58.93 +Nickel 28 Ni 58.71 +Copper 29 Cu 63.54 +Zinc 30 Zn 65.37 +Gallium 31 Ga 69.72 +Germanium 32 Ge 72.59 +Arsenic 33 As 74.99 +Selenium 34 Se 78.96 +Bromine 35 Br 79.91 +Krypton 36 Kr 83.8 +Rubidium 37 Rb 85.47 +Strontium 38 Sr 87.62 +Yttrium 39 Y 88.91 +Zirconium 40 Zr 91.22 +Niobium 41 Nb 92.91 +Molybdenum 42 Mo 95.94 +Technetium 43 Tc 96.91 +Ruthenium 44 Ru 101.07 +Rhodium 45 Rh 102.9 +Palladium 46 Pd 106.4 +Silver 47 Ag 107.87 +Cadmium 48 Cd 112.4 +Indium 49 In 114.82 +Tin 50 Sn 118.69 +Antimony 51 Sb 121.75 +Tellurium 52 Te 127.6 +Iodine 53 I 126.9 +Xenon 54 Xe 131.3 +Cesium 55 Cs 132.9 +Barium 56 Ba 137.34 +Lanthanum 57 La 138.91 +Cerium 58 Ce 140.12 +Praseodymium 59 Pr 140.91 +Neodymium 60 Nd 144.24 +Promethium 61 Pm 144.91 +Samarium 62 Sm 150.35 +Europium 63 Eu 151.96 +Gadolinium 64 Gd 157.25 +Terbium 65 Tb 158.92 +Dysprosium 66 Dy 162.5 +Holmium 67 Ho 164.93 +Erbium 68 Er 167.26 +Thulium 69 Tm 168.93 +Ytterbium 70 Yb 173.04 +Lutetium 71 Lu 174.97 +Hafnium 72 Hf 178.49 +Tantalum 73 Ta 180.95 +Wolfram 74 W 183.85 +Rhenium 75 Re 186.2 +Osmium 76 Os 190.2 +Iridium 77 Ir 192.22 +Platinum 78 Pt 195.09 +Gold 79 Au 196.97 +Mercury 80 Hg 200.59 +Thallium 81 Tl 204.37 +Lead 82 Pb 207.19 +Bismuth 83 Bi 208.98 +Polonium 84 Po 208.98 +Astatine 85 At 209.99 +Radon 86 Rn 222.02 +Francium 87 Fr 223.02 +Radium 88 Ra 226 +Actinium 89 Ac 227.03 +Thorium 90 Th 232.04 +Protactinium 91 Pa 231.04 +Uranium 92 U 238.03 +Neptunium 93 Np 237 +Plutonium 94 Pu 242 +Americium 95 Am 243.06 +Curium 96 Cm 247.07 +Berkelium 97 Bk 247.07 +Californium 98 Cf 251.08 +Einsteinium 99 Es 254.09 +Fermium 100 Fm 257.1 +Mendelevium 101 Md 257.1 +Nobelium 102 No 255.09 +Lawrencium 103 Lr 256.1
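The MM calculation test added in test_export_to_metexp_tabular.py (further below) shows how this table is meant to be used: a formula is split into element/count pairs and the relative atomic masses are summed. The actual implementation lives in export_to_metexp_tabular.py, which is not part of this changeset; a minimal sketch under that assumption, reading the tab-separated table above:

    import re

    # chemical symbol -> relative atomic mass, from static_resources/elements_and_masses.tab:
    masses = {}
    with open('static_resources/elements_and_masses.tab') as handle:
        handle.readline()  # skip the header line
        for line in handle:
            columns = line.rstrip('\n').split('\t')
            masses[columns[2]] = float(columns[3])

    def molecular_mass(formula):
        '''Sum the atomic masses of a simple formula such as C8H18O3.'''
        total = 0.0
        for symbol, count in re.findall('([A-Z][a-z]?)([0-9]*)', formula):
            if symbol:
                total += masses[symbol] * (int(count) if count else 1)
        return total

    print molecular_mass('C8H18O3')  # 12.01*8 + 1.01*18 + 16*3 = 162.26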
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/Copy of metaMS_cmd_interface.r Fri Oct 24 12:52:56 2014 +0200 @@ -0,0 +1,65 @@ +## =========== This is a copy that can be executed in R environment directly for interactive tests + +## read args: +args <- "test" + +## the constructed DB, e.g. "E:/Rworkspace/metaMS/data/LCDBtest.RData" +args.constructedDB <- "E:/workspace/PRIMS-metabolomics/python-tools/tools/metaMS/test/LCDBtest.RData" +## data files, e.g. "E:/Rworkspace/metaMS/data/data.zip" (with e.g. .CDF files) and unzip output dir, e.g. "E:/" +args.dataZip <- "E:/workspace/PRIMS-metabolomics/python-tools/tools/metaMS/test/extdata.zip" +args.zipExtrDir <- "E:/workspace/PRIMS-metabolomics/python-tools/tools/metaMS/test/zipOut" +## settings file, e.g. "E:/Rworkspace/metaMS/data/settings.r", should contain assignment to an object named "customMetaMSsettings" +args.settings <- "E:/workspace/PRIMS-metabolomics/python-tools/tools/metaMS/test/example_settings.txt" + +## output file names, e.g. "E:/Rworkspace/metaMS/data/out.txt" +args.outAnnotationTable <- "E:/workspace/PRIMS-metabolomics/python-tools/tools/metaMS/test/out/annotationTable.txt" +args.outLogFile <- "E:/workspace/PRIMS-metabolomics/python-tools/tools/metaMS/test/out/logfile.txt" + +library(metaMS) + +## load the constructed DB : +tempEnv <- new.env() +testDB <- load(args.constructedDB, envir=tempEnv) + +## load the data files from a zip file +files <- unzip(args.dataZip, exdir=args.zipExtrDir) + +## load settings "script" into "customMetaMSsettings" +source(args.settings, local=tempEnv) + + +# Just to highlight: if you want to use more than one +# trigger runLC: +LC <- runLC(files, settings = tempEnv[["customMetaMSsettings"]], DB = tempEnv[[testDB[1]]]$DB, nSlaves=20, returnXset = TRUE) + +# write out runLC annotation results: +write.table(LC$Annotation$annotation.table, args.outAnnotationTable, sep="\t", row.names=FALSE) + +# the used constructed DB (write to log): +sink( args.outLogFile ) +cat("Constructed DB info===============:") +str(tempEnv[[testDB[1]]]$Info) +cat("Constructed DB table===============:") +sink() +write.table(tempEnv[[testDB[1]]]$DB, args.outLogFile, append=TRUE, row.names=FALSE) +write.table(tempEnv[[testDB[1]]]$Reftable, args.outLogFile, sep="\t", append=TRUE, row.names=FALSE) + +# report +LC$xset@xcmsSet +gt <- groups(LC$xset@xcmsSet) +colnames(gt) +groupidx1 <- which(gt[,"rtmed"] > 0 & gt[,"rtmed"] < 3000 & gt[,"npeaks"] > 3) +if (length(groupidx1) > 0) +{ + eiccor <- getEIC(LC$xset@xcmsSet, groupidx = c(groupidx1)) + eicraw <- getEIC(LC$xset@xcmsSet, groupidx = c(groupidx1), rt = "raw") + for (i in 1:length(groupidx1)) + { + png( paste("E:/workspace/PRIMS-metabolomics/python-tools/tools/metaMS/test/out/outplot", i,".png", sep="") ) + plot(eiccor, LC$xset@xcmsSet, groupidx = i) + devname = dev.off() + } + +} + +
--- a/test/__init__.py Thu Jan 16 13:22:38 2014 +0100 +++ b/test/__init__.py Fri Oct 24 12:52:56 2014 +0200 @@ -1,1 +1,1 @@ -''' BRS GCMS Galaxy Tools Module ''' +''' unit tests '''
--- a/test/integration_tests.py Thu Jan 16 13:22:38 2014 +0100 +++ b/test/integration_tests.py Fri Oct 24 12:52:56 2014 +0200 @@ -119,7 +119,7 @@ #Build up arguments and run input_txt = resource_filename(__name__, "data/integration/NIST_identification_results_tabular.txt") - library = resource_filename(__name__, "data/integration/Library_RI_DB_capillary_columns-noDuplicates.txt") + library = resource_filename(__name__, "../repositories/PRIMS-metabolomics/RI_DB_libraries/Library_RI_DB_capillary_columns-noDuplicates.txt") regress_model = resource_filename(__name__, "data/integration/regression_MODEL_for_columns.txt") sys.argv = ['test', library,
--- a/test/test_export_to_metexp_tabular.py Thu Jan 16 13:22:38 2014 +0100 +++ b/test/test_export_to_metexp_tabular.py Fri Oct 24 12:52:56 2014 +0200 @@ -10,6 +10,27 @@ class IntegrationTest(unittest.TestCase): + def test_MM_calculations(self): + ''' + test the implemented method for MM calculations for + given chemical formulas + ''' + export_to_metexp_tabular.init_elements_and_masses_map() + + formula = "C8H18O3" + # should be = 12.01*8 + 1.01*18 + 16*3 = 162.26 + result = export_to_metexp_tabular.get_molecular_mass(formula) + self.assertEqual(162.26, result) + + formula = "CH2O3Fe2Ni" + # should be = 12.01*1 + 1.01*2 + 16*3 + 55.85*2 + 58.71 = 232.44 + result = export_to_metexp_tabular.get_molecular_mass(formula) + self.assertAlmostEqual(232.44, result, 2) + + + + + def test_combine_output_simple(self): ''' comment me @@ -28,7 +49,13 @@ sys.argv = ['test', rankfilter_and_caslookup_combined_file, msclust_quantification_and_spectra_file, - output_csv] + output_csv, + 'tomato', + 'leafs', + 'test experiment', + 'pieter', + 'DB5 column'] + # Execute main function with arguments provided through sys.argv export_to_metexp_tabular.main()
--- a/test/test_library_lookup.py Thu Jan 16 13:22:38 2014 +0100 +++ b/test/test_library_lookup.py Fri Oct 24 12:52:56 2014 +0200 @@ -27,7 +27,7 @@ Tests the 'create_lookup_table' function ''' column_type = 'Capillary' - polarity = 'Semi-standard non-polar' + polarity = 'Semi-standard non-polar6' lookup_dict = library_lookup.create_lookup_table(self.ri_database, column_type, polarity) self.assertFalse(False in [res[4] == 'Capillary' for res in lookup_dict['4177166']]) self.assertEqual(['C51276336', '2,6-Dimethyl-octa-1,7-dien-3,6-diol', 'C10H18O2', @@ -147,9 +147,9 @@ ''' Tests the match_library.py functionality ''' - riqc_libs_dir = resource_filename(__name__, "../repositories") + riqc_libs_dir = resource_filename(__name__, "../repositories/PRIMS-metabolomics/RI_DB_libraries") get_library_files_output = match_library.get_directory_files(riqc_libs_dir) - self.assertEqual(4, len(get_library_files_output)) + self.assertEqual(2, len(get_library_files_output)) self.assertEqual("Library_RI_DB_capillary_columns-noDuplicates", get_library_files_output[0][0]) #TODO change assert below to assert that the result is a file, so the test can run on other dirs as well: #self.assertEqual("E:\\workspace\\PRIMS-metabolomics\\python-tools\\tools\\GCMS\\test\\data\\riqc_libs\\RI DB library (capillary columns) Dec.2012.txt", get_library_files_output[0][1])
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/test_query_mass_repos.py Fri Oct 24 12:52:56 2014 +0200 @@ -0,0 +1,62 @@ +'''Integration tests for the GCMS project''' + +from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 +from MS import query_mass_repos +import os.path +import sys +import unittest + + +class IntegrationTest(unittest.TestCase): + + + + + def test_simple(self): + ''' + Simple initial test + ''' + # Create out folder + outdir = "output/query_mass_repos/" + if not os.path.exists(outdir): + os.makedirs(outdir) + + #Build up arguments and run + + # input_file = sys.argv[1] + # molecular_mass_col = sys.argv[2] + # repository_file = sys.argv[3] + # mass_tolerance = float(sys.argv[4]) + # output_result = sys.argv[5] + + input_file = resource_filename(__name__, "data/service_query_tabular.txt") + + molecular_mass_col = "mass (Da)" + dblink_file = resource_filename(__name__, "data/MFSearcher ExactMassDB service.txt") + output_result = resource_filename(__name__, outdir + "metexp_query_results_added.txt") + + + + + sys.argv = ['test', + input_file, + molecular_mass_col, + dblink_file, + '0.001', + 'ms', + output_result] + + # Execute main function with arguments provided through sys.argv + query_mass_repos.main() + + + + + +def _read_file(filename): + ''' + Helper method to quickly read a file + @param filename: + ''' + with open(filename) as handle: + return handle.read()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/test_query_metexp.py Fri Oct 24 12:52:56 2014 +0200 @@ -0,0 +1,82 @@ +'''Integration tests for the GCMS project''' + +from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 +from GCMS import query_metexp +import os.path +import sys +import unittest + + +class IntegrationTest(unittest.TestCase): + + +# def test_MM_calculations(self): +# ''' +# test the implemented method for MM calculations for +# given chemical formulas +# ''' +# export_to_metexp_tabular.init_elements_and_masses_map() +# +# formula = "C8H18O3" +# # should be = 12.01*8 + 1.01*18 + 16*3 = 162.26 +# result = export_to_metexp_tabular.get_molecular_mass(formula) +# self.assertEqual(162.26, result) +# +# formula = "CH2O3Fe2Ni" +# # should be = 12.01*1 + 1.01*2 + 16*3 + 55.85*2 + 58.71 = 232.44 +# result = export_to_metexp_tabular.get_molecular_mass(formula) +# self.assertAlmostEqual(232.44, result, 2) +# +# +# + + + def test_simple(self): + ''' + Simple initial test + ''' + # Create out folder + outdir = "output/metexp_query/" + if not os.path.exists(outdir): + os.makedirs(outdir) + + #Build up arguments and run + + # input_file = sys.argv[1] + # molecular_mass_col = sys.argv[2] + # formula_col = sys.argv[3] + # metexp_dblink_file = sys.argv[4] + # output_result = sys.argv[5] + + input_file = resource_filename(__name__, "data/metexp_query_tabular.txt") + casid_col = "CAS" + formula_col = "FORMULA" + molecular_mass_col = "MM" + metexp_dblink_file = resource_filename(__name__, "data/METEXP Test DB.txt") + output_result = resource_filename(__name__, outdir + "metexp_query_results_added.txt") + + sys.argv = ['test', + input_file, + casid_col, + formula_col, + molecular_mass_col, + metexp_dblink_file, + 'GC', + output_result] + + # Execute main function with arguments provided through sys.argv + query_metexp.main() + + # TODO - asserts (base them on DB being filled with test data form metexp unit test for upload method) + # PA + + + + +def _read_file(filename): + ''' + Helper method to quickly read a file + @param filename: + ''' + with open(filename) as handle: + return handle.read()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/test_query_metexp_LARGE.py Fri Oct 24 12:52:56 2014 +0200 @@ -0,0 +1,79 @@ +'''Integration tests for the GCMS project''' + +from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 +from GCMS import query_metexp +import os.path +import sys +import unittest + + +class IntegrationTest(unittest.TestCase): + + +# def test_MM_calculations(self): +# ''' +# test the implemented method for MM calculations for +# given chemical formulas +# ''' +# export_to_metexp_tabular.init_elements_and_masses_map() +# +# formula = "C8H18O3" +# # should be = 12.01*8 + 1.01*18 + 16*3 = 162.26 +# result = export_to_metexp_tabular.get_molecular_mass(formula) +# self.assertEqual(162.26, result) +# +# formula = "CH2O3Fe2Ni" +# # should be = 12.01*1 + 1.01*2 + 16*3 + 55.85*2 + 58.71 = 232.44 +# result = export_to_metexp_tabular.get_molecular_mass(formula) +# self.assertAlmostEqual(232.44, result, 2) +# +# +# + + + def test_large(self): + ''' + Simple test, but on larger set, last test executed in 28s + ''' + # Create out folder + outdir = "output/metexp_query/" + if not os.path.exists(outdir): + os.makedirs(outdir) + + #Build up arguments and run + + # input_file = sys.argv[1] + # molecular_mass_col = sys.argv[2] + # formula_col = sys.argv[3] + # metexp_dblink_file = sys.argv[4] + # output_result = sys.argv[5] + + input_file = resource_filename(__name__, "data/metexp_query_tabular_large.txt") + casid_col = "CAS" + formula_col = "FORMULA" + molecular_mass_col = "MM" + metexp_dblink_file = resource_filename(__name__, "data/METEXP Test DB.txt") + output_result = resource_filename(__name__, outdir + "metexp_query_results_added_LARGE.txt") + + sys.argv = ['test', + input_file, + casid_col, + formula_col, + molecular_mass_col, + metexp_dblink_file, + 'GC', + output_result] + + # Execute main function with arguments provided through sys.argv + query_metexp.main() + + + + +def _read_file(filename): + ''' + Helper method to quickly read a file + @param filename: + ''' + with open(filename) as handle: + return handle.read()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xcms_differential_analysis.r Fri Oct 24 12:52:56 2014 +0200 @@ -0,0 +1,73 @@ +## read args: +args <- commandArgs(TRUE) +#cat("args <- \"\"\n") +## an xcms xset saved as .rds (read with readRDS below) +args.xsetData <- args[1] +#cat(paste("args.xsetData <- \"", args[1], "\"\n", sep="")) + +args.class1 <- args[2] +args.class2 <- args[3] +#cat(paste("args.class1 <- \"", args[2], "\"\n", sep="")) +#cat(paste("args.class2 <- \"", args[3], "\"\n", sep="")) + +args.topcount <- strtoi(args[4]) +#cat(paste("args.topcount <- ", args[4], "\n", sep="")) + +args.outTable <- args[5] +args.outLogFile <- args[6] +#cat(paste("args.outLogFile <- \"", args[6], "\"\n", sep="")) + +## report files +args.htmlReportFile <- args[7] +args.htmlReportFile.files_path <- args[8] +#cat(paste("args.htmlReportFile <- \"", args[7], "\"\n", sep="")) +#cat(paste("args.htmlReportFile.files_path <- \"", args[8], "\"\n", sep="")) + +# Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888 +msg <- file(args.outLogFile, open="wt") +sink(msg, type="message") +sink(msg, type="output") + +tryCatch( + { + library(xcms) + #library("R2HTML") + + ## load the saved xcms xset : + xcmsSet <- readRDS(args.xsetData) + + # info: levels(xcmsSet@phenoData$class) also gives access to the class names + + reporttab <- diffreport(xcmsSet, args.class1, args.class2, args.htmlReportFile.files_path, args.topcount, metlin = 0.15, h=480, w=640) + + # write out tsv table: + write.table(reporttab, args.outTable, sep="\t", row.names=FALSE) + + message("\nGenerating report.........") + # report + dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE) + # setwd(file.path(args.htmlReportFile.files_path)) + + cat("<html><body><h1>Differential analysis report</h1>", file= args.htmlReportFile) + #HTML(reporttab[1:args.topcount,], file= args.htmlReportFile) + figuresPath <- paste(args.htmlReportFile.files_path, "_eic", sep="") + message(figuresPath) + listOfFiles <- list.files(path = figuresPath) + for (i in 1:length(listOfFiles)) + { + figureName <- listOfFiles[i] + # maybe we still need to copy the figures to the args.htmlReportFile.files_path + cat(paste("<img src='fig_eic/", figureName,"' />", sep=""), file= args.htmlReportFile, append=TRUE) + } + + message("finished generating report") + cat("\nWarnings================:\n") + str( warnings() ) + }, + error=function(cond) { + sink(NULL, type="message") # default setting + sink(stderr(), type="output") + message("\nERROR: ===========\n") + print(cond) + } + ) \ No newline at end of file
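The script reads eight positional arguments (see the top of the file); a sketch of how it could be invoked, here from Python for consistency with the rest of this changeset (Rscript on the PATH and all file names are hypothetical):

    import subprocess

    subprocess.check_call([
        'Rscript', 'xcms_differential_analysis.r',
        'xset.rds',         # saved xcms xset, read with readRDS()
        'wildtype',         # class 1
        'mutant',           # class 2
        '10',               # top N features for the diffreport
        'diffreport.tsv',   # output table
        'run.log',          # log file (stdout/stderr are sinked here)
        'report.html',      # HTML report file
        'report_files'      # files_path directory for the report images
    ])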