changeset 1:071a185c2ced
new tools
--- a/README.rst	Thu Jan 16 13:22:38 2014 +0100
+++ b/README.rst	Fri Oct 24 12:52:56 2014 +0200
@@ -19,6 +19,22 @@
 ============== ======================================================================
 Date           Changes
 -------------- ----------------------------------------------------------------------
+September 2014 * Added new membership cutoff option for final clusters in MsClust
+               * Improved MsClust memory usage for large datasets
+               * Simplified MsClust HTML report
+               * Added option for micro-minutes based clustering in MsClust, instead
+                 of scan-nr based
+April 2014     * Added interface to ExactMassDB, Pep1000, KEGG, KNApSAcK, Flavonoid
+                 Viewer, LipidMAPS, HMDB, PubChem, by using the service MFSearcher.
+                 This enables users to query multiple public repositories for
+                 elemental compositions from accurate mass values detected by
+                 high-resolution mass spectrometers. NB: see also the added
+                 licensing info below.
+March 2014     * Added interface to the METEXP data store, including a tool to fire
+                 queries in batch mode
+               * Improved quantification output files of MsClust, among others by
+                 sorting the mass list based on intensity (last two columns of the
+                 quantification files)
 January 2014   * first release via Tool Shed, combining the RIQC and MsClust in a
                  single package (this package)
                * integration with METEXP software (data store for metabolomics
@@ -68,4 +84,17 @@
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-
\ No newline at end of file
+
+
+License for third party services
+================================
+MFSearcher service: http://webs2.kazusa.or.jp/mfsearcher/#090
+In the MFSearcher system, the compound data provided by KEGG, Flavonoid Viewer, LIPID MAPS, HMDB and PubChem
+were downloaded for academic purposes. The compound data of KNApSAcK is provided by Prof. Kanaya at the
+Nara Institute of Science and Technology (NAIST). Part of these data is utilized to construct the
+specified databases for rapid mass searching in the MFSearcher system, after re-calculating the molecular weights.
+Please observe the license terms of each original database when utilizing search results obtained against these
+databases via MFSearcher.
+
+The searching system of MFSearcher, the ExactMassDB database, and the Pep1000 database by Kazusa DNA
+Research Institute are licensed under a Creative Commons Attribution-NonCommercial 3.0 Unported License.
\ No newline at end of file
--- a/__init__.py	Thu Jan 16 13:22:38 2014 +0100
+++ b/__init__.py	Fri Oct 24 12:52:56 2014 +0200
@@ -1,6 +1,6 @@
-'''
-Module containing Galaxy tools for the GC/MS pipeline
-Created on Mar 6, 2012
-
-@author: marcelk
-'''
+'''
+Module containing Galaxy tools for the LC/MS or GC/MS pipeline
+Created on Mar, 2014
+
+@author: pieter lukasse
+'''
\ No newline at end of file
--- a/combine_output.py	Thu Jan 16 13:22:38 2014 +0100
+++ b/combine_output.py	Fri Oct 24 12:52:56 2014 +0200
@@ -155,12 +155,16 @@
     @param data: dictionary containing merged dataset
     @param out_csv: output csv file
     '''
-    header = ['Centrotype',
+    # Columns we don't repeat:
+    header_part1 = ['Centrotype',
               'cent.Factor',
               'scan nr.',
               'R.T. (umin)',
              'nr. Peaks',
-              'R.T.',
+              'R.T.']
+    # These are the headers/columns we repeat in case of
+    # combining hits in one line (see alternative_headers method below):
+    header_part2 = [
               'Name',
               'FORMULA',
               'Library',
@@ -190,13 +194,21 @@
     output_multi_handle = csv.writer(outfile_multi_handle, delimiter="\t")
 
     # Write headers
-    output_single_handle.writerow(header)
-    output_multi_handle.writerow(header * nhits)
+    output_single_handle.writerow(header_part1 + header_part2)
+    output_multi_handle.writerow(header_part1 + header_part2 + alternative_headers(header_part2, nhits-1))
 
     # Combine all hits for each centrotype into one line
     line = []
     for centrotype_idx in xrange(len(data)):
+        i = 0
         for hit in data[centrotype_idx]:
-            line.extend(hit)
+            if i == 0:
+                line.extend(hit)
+            else:
+                # for the alternative hits, skip the header_part1 columns already written:
+                line.extend(hit[6:])
+            i += 1
+        # small validation (if this fails, it is a programming error):
+        if i > nhits:
+            raise Exception('Error: more hits than expected for centrotype_idx ' + str(centrotype_idx))
         output_multi_handle.writerow(line)
         line = []
 
@@ -205,6 +217,17 @@
         for hit in data[centrotype_idx]:
             output_single_handle.writerow(hit)
 
+def alternative_headers(header_part2, nr_alternative_hits):
+    '''
+    This method will iterate over the header names and add the string 'ALT#_' before each,
+    where # is the number of the alternative, according to the number of alternative hits
+    we want to add to the final csv/tsv
+    '''
+    result = []
+    for i in xrange(nr_alternative_hits):
+        for header_name in header_part2:
+            result.append("ALT" + str(i+1) + "_" + header_name)
+    return result
 
 def main():
     '''
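For illustration, a minimal standalone sketch of what the repeated-header expansion produces (Python 2, matching the xrange usage above; the two-column header list is a made-up example):

    def alternative_headers(header_part2, nr_alternative_hits):
        # prefix each repeated column with ALT<n>_ for every extra hit
        return ["ALT" + str(i + 1) + "_" + header_name
                for i in xrange(nr_alternative_hits)
                for header_name in header_part2]

    print(alternative_headers(['Name', 'FORMULA'], 2))
    # ['ALT1_Name', 'ALT1_FORMULA', 'ALT2_Name', 'ALT2_FORMULA']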
--- a/combine_output.xml	Thu Jan 16 13:22:38 2014 +0100
+++ b/combine_output.xml	Fri Oct 24 12:52:56 2014 +0200
@@ -4,10 +4,11 @@
     combine_output.py $rankfilter_in $caslookup_in $out_single $out_multi
   </command>
   <inputs>
-    <param format="tabular" name="caslookup_in" type="data" label="RIQC-Lookup RI for CAS output"
+    <param format="tabular" name="rankfilter_in" type="data" label="RIQC-RankFilter output (Estimated RI)"
+           help="Select the output file from the RankFilter tool"/>
+    <param format="tabular" name="caslookup_in" type="data" label="RIQC-Lookup RI for CAS output ('Known' RI)"
            help="Select the output file from the CasLookup tool"/>
-    <param format="tabular" name="rankfilter_in" type="data" label="RIQC-RankFilter output"
-           help="Select the output file from the RankFilter tool"/>
+    <!-- TODO: could add a param "tolerance for ERI-KRI" (Estimated RI - Known RI) -->
   </inputs>
   <outputs>
     <data format="tabular" label="${tool.name} (Single) on ${on_string}" name="out_single" />
--- a/datatypes_conf.xml	Thu Jan 16 13:22:38 2014 +0100
+++ b/datatypes_conf.xml	Fri Oct 24 12:52:56 2014 +0200
@@ -3,9 +3,6 @@
   <datatype_files>
   </datatype_files>
   <registration display_path="display_applications">
-    <!-- type for the pdf -->
-    <datatype extension="pdf" type="galaxy.datatypes.data:Data" mimetype="application/octet-stream"
-              display_in_upload="true" subclass="true"/>
     <datatype extension="msclust.csv" type="galaxy.datatypes.tabular:Tabular" mimetype="text/csv"
               display_in_upload="true" subclass="true">
     </datatype>
   </registration>
--- a/export_to_metexp_tabular.py	Thu Jan 16 13:22:38 2014 +0100
+++ b/export_to_metexp_tabular.py	Fri Oct 24 12:52:56 2014 +0200
@@ -5,17 +5,18 @@
 into a tabular file that can be uploaded to the MetExp database.
 
 RankFilter, CasLookup are already combined by combine_output.py so here we will use
-this result. Furthermore here the MsClust spectra file (.MSP) and one of the MsClust
-quantification files are to be combined with combine_output.py result as well.
+this result. Furthermore, here one of the MsClust quantification files, containing
+the respective spectra details, is to be combined as well.
 
 Extra calculations performed:
 - The column MW is also added here and is derived from the column FORMULA found
-  in combine_output.py result.
+  in the RankFilter, CasLookup combined result.
 
-So in total here we merge 3 files and calculate one new column.
+So in total here we merge 2 files and calculate one new column.
 '''
-
+from pkg_resources import resource_filename  # @UnresolvedImport # pylint: disable=E0611
 import csv
+import re
 import sys
 from collections import OrderedDict
@@ -40,14 +41,15 @@
 ONE_TO_ONE = 'one_to_one'
 N_TO_ONE = 'n_to_one'
 
-def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, relation_type=ONE_TO_ONE):
+def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, metadata, relation_type=ONE_TO_ONE):
     '''
     Merges data from both input dictionaries based on the link fields. This method will
     build up a new list containing the merged hits as the items.
     @param set1: dictionary holding set1 in the form of N lists (one list per attribute name)
     @param set2: dictionary holding set2 in the form of N lists (one list per attribute name)
     '''
-    # TODO test for correct input files -> same link_field values should be there (test at least number of unique link_field values):
+    # TODO test for correct input files -> same link_field values should be there
+    # (test at least the number of unique link_field values):
     #
     # if (len(set1[link_field_set1]) != len(set2[link_field_set2])):
     #     raise Exception('input files should have the same nr of key values ')
@@ -64,17 +66,23 @@
         # Get the indices for current link_field_set1_value in both data-structures for proper matching
         set1index = [index for index, value in enumerate(set1[link_field_set1]) if value == link_field_set1_value]
         set2index = [index for index, value in enumerate(set2[link_field_set2]) if compare_function(value, link_field_set1_value)==True ]
-
-
+        # Validation:
+        if len(set2index) == 0:
+            # corresponding data could not be found in set2, so throw an error:
+            raise Exception("Datasets not compatible, merge not possible. " + link_field_set1 + "=" +
+                            link_field_set1_value + " only found in first dataset. ")
 
         merged_hits = []
         # Combine hits
         for hit in xrange(len(set1index)):
             # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do
             # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its
-            # corresponding value in the rankfilter or caslookup tables; i.e.
-            # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute
-            # represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index)
+            # corresponding value in the sets; i.e.
+            # set1[key] => returns the list/array with size = nrrows, with the values for the attribute
+            # represented by "key".
+            # set1index[hit] => points to the row nr=hit (hit is a rownr/index)
+            # So set1[x][set1index[n]] = set1.attributeX.instanceN
+            #
             # It just ensures the entry is made available as a plain named array for easy access.
             rf_record = OrderedDict(zip(set1.keys(), [set1[key][set1index[hit]] for key in set1.keys()]))
             if relation_type == ONE_TO_ONE :
@@ -83,7 +91,7 @@
             # is N to 1:
                 cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[0]] for key in set2.keys()]))
 
-            merged_hit = merge_function(rf_record, cl_record)
+            merged_hit = merge_function(rf_record, cl_record, metadata)
             merged_hits.append(merged_hit)
 
         merged.append(merged_hits)
@@ -103,29 +111,62 @@
 
 
-def _merge_records(rank_caslookup_combi, msclust_quant_record):
+def _merge_records(rank_caslookup_combi, msclust_quant_record, metadata):
     '''
     Combines single records from both the RankFilter+CasLookup combi file and from MsClust file
     @param rank_caslookup_combi: rankfilter and caslookup combined record (see combine_output.py)
     @param msclust_quant_record: msclust quantification + spectrum record
     '''
-    i = 0
     record = []
     for column in rank_caslookup_combi:
         record.append(rank_caslookup_combi[column])
-        i += 1
 
     for column in msclust_quant_record:
         record.append(msclust_quant_record[column])
-        i += 1
+
+    for column in metadata:
+        record.append(metadata[column])
+
+    # add MOLECULAR MASS (MM)
+    molecular_mass = get_molecular_mass(rank_caslookup_combi['FORMULA'])
+    # limit to two decimals:
+    record.append("{0:.2f}".format(molecular_mass))
+
+    # add MOLECULAR WEIGHT (MW) - TODO - calculate this
+    record.append('0.0')
+
+    # level of identification and Location of reference standard
+    record.append('0')
+    record.append('')
 
     return record
 
-
+def get_molecular_mass(formula):
+    '''
+    Calculates the molecular mass (MM).
+    E.g. MM of H2O = (relative) atomic mass of H x2 + (relative) atomic mass of O
+    '''
+
+    # Each element is represented by a capital letter, optionally followed by
+    # a lower case letter, with optional digits giving the element count:
+    element_pattern = re.compile(r"([A-Z][a-z]?)(\d*)")
 
-def _save_data(data, headers, nhits, out_csv):
+    total_mass = 0
+    for (element_name, count) in element_pattern.findall(formula):
+        if count == "":
+            count = 1
+        else:
+            count = int(count)
+        # note: Python's built-in float type has double precision
+        element_mass = float(elements_and_masses_map[element_name])
+        total_mass += element_mass * count
+
+    return total_mass
+
+
+
+def _save_data(data, headers, out_csv):
     '''
     Writes tab-separated data to file
     @param data: dictionary containing merged dataset
@@ -139,12 +180,35 @@
     # Write headers
     output_single_handle.writerow(headers)
 
-    # Write one line for each centrotype
-    for centrotype_idx in xrange(len(data)):
-        for hit in data[centrotype_idx]:
+    # Write
+    for item_idx in xrange(len(data)):
+        for hit in data[item_idx]:
             output_single_handle.writerow(hit)
 
+def _get_map_for_elements_and_masses(elements_and_masses):
+    '''
+    This method will read out the column 'Chemical symbol' and make a map
+    of this, storing the column 'Relative atomic mass' as its value
+    '''
+    resultMap = {}
+    index = 0
+    for entry in elements_and_masses['Chemical symbol']:
+        resultMap[entry] = elements_and_masses['Relative atomic mass'][index]
+        index += 1
+
+    return resultMap
+
+
+def init_elements_and_masses_map():
+    '''
+    Initializes the lookup map containing the elements and their respective masses
+    '''
+    elements_and_masses = _process_data(resource_filename(__name__, "static_resources/elements_and_masses.tab"))
+    global elements_and_masses_map
+    elements_and_masses_map = _get_map_for_elements_and_masses(elements_and_masses)
+
+
 def main():
     '''
     Combine Output main function
@@ -156,15 +220,27 @@
     rankfilter_and_caslookup_combined_file = sys.argv[1]
     msclust_quantification_and_spectra_file = sys.argv[2]
     output_csv = sys.argv[3]
+    # metadata
+    metadata = OrderedDict()
+    metadata['organism'] = sys.argv[4]
+    metadata['tissue'] = sys.argv[5]
+    metadata['experiment_name'] = sys.argv[6]
+    metadata['user_name'] = sys.argv[7]
+    metadata['column_type'] = sys.argv[8]
 
     # Read RankFilter and CasLookup output files
     rankfilter_and_caslookup_combined = _process_data(rankfilter_and_caslookup_combined_file)
     msclust_quantification_and_spectra = _process_data(msclust_quantification_and_spectra_file, ',')
+    # Read elements and masses to use for the MW/MM calculation:
+    init_elements_and_masses_map()
 
     merged, nhits = _merge_data(rankfilter_and_caslookup_combined, 'Centrotype',
-                                msclust_quantification_and_spectra, 'centrotype', _compare_records, _merge_records, N_TO_ONE)
-    headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys()
-    _save_data(merged, headers, nhits, output_csv)
+                                msclust_quantification_and_spectra, 'centrotype',
+                                _compare_records, _merge_records, metadata,
+                                N_TO_ONE)
+    headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys() + metadata.keys() + ['MM', 'MW', 'Level of identification', 'Location of reference standard']
+    _save_data(merged, headers, output_csv)
 
 
 if __name__ == '__main__':
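As a worked example of the formula parsing introduced above (a standalone sketch; the three-entry mass map is a made-up stand-in for the static_resources/elements_and_masses.tab lookup):

    import re

    elements_and_masses_map = {'C': 12.011, 'H': 1.008, 'O': 15.999}  # truncated stand-in

    def get_molecular_mass(formula):
        # capital letter, optional lower-case letter, optional digit count
        element_pattern = re.compile(r"([A-Z][a-z]?)(\d*)")
        total_mass = 0
        for element_name, count in element_pattern.findall(formula):
            total_mass += float(elements_and_masses_map[element_name]) * (int(count) if count else 1)
        return total_mass

    print("{0:.2f}".format(get_molecular_mass("C6H12O6")))  # glucose: 180.16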
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/export_to_metexp_tabular.xml	Fri Oct 24 12:52:56 2014 +0200
@@ -0,0 +1,75 @@
+<tool id="export_to_metexp_tabular"
+      name="METEXP - Tabular file"
+      version="0.2.0">
+  <description>Create tabular file for loading into METabolomics EXPlorer database</description>
+  <command interpreter="python">
+    export_to_metexp_tabular.py
+    $rankfilter_and_caslookup_combi
+    $msclust_quant_file
+    $output_result
+    "$organism"
+    "$tissue"
+    "$experiment_name"
+    "$user_name"
+    "$column_type"
+  </command>
+  <inputs>
+    <param format="tabular" name="rankfilter_and_caslookup_combi" type="data" label="RIQC-Combine RankFilter and CasLookup output"
+           help="Select the (multi) output file from the 'Combine RankFilter and CasLookup' tool"/>
+    <param format="tabular" name="msclust_quant_file" type="data" label="MsClust quantification file output"
+           help="Select the output file from MsClust (centrotype, mic or sim) which also contains the respective spectrum details"/>
+
+    <param name="organism" type="text" size="80"
+           label="Organism(s) info"
+           help="Metadata information to accompany the results when stored in MetExp DB.">
+      <validator type="empty_field" message="A value is required."></validator><!-- attribute optional="False" does not seem to work for params, so a validator is added -->
+    </param>
+
+    <param name="tissue" type="text" size="80"
+           label="Tissue(s) info"
+           help="Metadata information to accompany the results when stored in MetExp DB.">
+      <validator type="empty_field" message="A value is required."></validator>
+    </param>
+
+    <param name="experiment_name" type="text" size="80"
+           label="Experiment name/code"
+           help="Name or code to store the results under. This helps you to retrieve the results later in MetExp DB.">
+      <validator type="empty_field" message="A value is required."></validator>
+    </param>
+
+    <param name="user_name" type="text" size="80"
+           label="User name"
+           help="User name or code to store the results under. This helps you to retrieve the results later in MetExp DB.">
+      <validator type="empty_field" message="A value is required."></validator>
+    </param>
+
+    <param name="column_type" type="text" size="80"
+           label="Column type"
+           help="Column type to report with the results. This helps you to retrieve the results later in MetExp DB.">
+      <validator type="empty_field" message="A value is required."></validator>
+    </param>
+
+  </inputs>
+  <outputs>
+    <data format="tabular" label="${tool.name} on ${on_string}" name="output_result" />
+  </outputs>
+  <help>
+.. class:: infomark
+
+Tool to combine output from the tools RankFilter, CasLookup and MsClust
+into a tabular file that can be uploaded to the METabolomics EXPlorer (MetExp) database.
+
+RankFilter and CasLookup output are already combined by the 'RIQC-Combine RankFilter and CasLookup' tool, so here
+we use this result.
+
+**Notes**
+
+Extra calculations performed:
+- The columns MM and MW are also added here; they are derived from the column FORMULA found in the RankFilter/CasLookup combined result.
+
+So in total here we merge 2 files and calculate one new column.
+
+  </help>
+</tool>
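Outside Galaxy, the wrapper reduces to a plain positional call; a hypothetical invocation (all file names and metadata values are made up) matching the sys.argv order parsed in export_to_metexp_tabular.py:

    python export_to_metexp_tabular.py combine_output_multi.tsv msclust_quant.csv \
        metexp_upload.tab "Solanum lycopersicum" "fruit" "exp001" "jdoe" "GC"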
--- a/library_lookup.xml	Thu Jan 16 13:22:38 2014 +0100
+++ b/library_lookup.xml	Fri Oct 24 12:52:56 2014 +0200
@@ -13,16 +13,22 @@
     $regression.model
   </command>
   <inputs>
+    <!-- Regarding the <page> items: this blocks the use of this tool in Galaxy workflows. However,
+         alternatives like wrapping this in conditionals or repeats (to force a refresh_on_change, as this option
+         is not working on its own) failed, since the workflow editor does not support refreshes... nor does the
+         workflow runtime support conditionals or repeats to be set at runtime. See also the galaxy-dev mail
+         thread '"when else" in <conditional>? RE: refresh_on_change: is this a valid attribute? Any other ideas/options??' -->
     <page>
       <param format="tabular" name="input" type="data" label="NIST identifications as tabular file"
             help="Select a tab delimited NIST metabolite identifications file (converted from PDF)" />
       <param name="library_file" type="select" label="CAS x RI Library file"
             help="Select a library/lookup file containing RI values for CAS numbers on various chromatography columns "
             dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/RI_DB_libraries")'/>
+    </page>
+    <page>
       <param name="col_type" type="select" label="Select column type" refresh_on_change="true"
             display="radio" dynamic_options='get_column_type(library_file)'
             help="" />
-    </page>
     <page>
       <param name="polarity" type="select" label="Select polarity" refresh_on_change="true"
             display="radio" dynamic_options='filter_column(library_file,col_type)'
--- a/match_library.py	Thu Jan 16 13:22:38 2014 +0100
+++ b/match_library.py	Fri Oct 24 12:52:56 2014 +0200
@@ -17,15 +17,19 @@
     GC-column types. Used by the library_lookup.xml tool
     @param library_file: given library file from which the list of GC-column types is extracted
     '''
-    (data, header) = read_library(library_file)
-
-    if 'columntype' not in header:
-        raise IOError('Missing columns in ', library_file)
-
-    # Filter data on column type
-    column_type = header.index("columntype")
-    amounts_in_list_dict = count_occurrence([row[column_type] for row in data])
-    galaxy_output = [(str(a) + "(" + str(b) + ")", a, False) for a, b in amounts_in_list_dict.items()]
+    if library_file == "":
+        galaxy_output = [("", "", False)]
+    else:
+        (data, header) = read_library(library_file)
+
+        if 'columntype' not in header:
+            raise IOError('Missing columns in ', library_file)
+
+        # Filter data on column type
+        column_type = header.index("columntype")
+        amounts_in_list_dict = count_occurrence([row[column_type] for row in data])
+        galaxy_output = [(str(a) + "(" + str(b) + ")", a, False) for a, b in amounts_in_list_dict.items()]
+
     return(galaxy_output)
@@ -35,19 +39,23 @@
     @param library_file: file containing the database
     @param column_type_name: column type to filter on
     '''
-    (data, header) = read_library(library_file)
-
-    if ('columntype' not in header or
-        'columnphasetype' not in header):
-        raise IOError('Missing columns in ', library_file)
-
-    column_type = header.index("columntype")
-    statphase = header.index("columnphasetype")
-
-    # Filter data on colunn type name
-    statphase_list = [line[statphase] for line in data if line[column_type] == column_type_name]
-    amounts_in_list_dict = count_occurrence(statphase_list)
-    galaxy_output = [(str(a) + "(" + str(b) + ")", a, False)for a, b in amounts_in_list_dict.items()]
+    if library_file == "":
+        galaxy_output = [("", "", False)]
+    else:
+        (data, header) = read_library(library_file)
+
+        if ('columntype' not in header or
+            'columnphasetype' not in header):
+            raise IOError('Missing columns in ', library_file)
+
+        column_type = header.index("columntype")
+        statphase = header.index("columnphasetype")
+
+        # Filter data on column type name
+        statphase_list = [line[statphase] for line in data if line[column_type] == column_type_name]
+        amounts_in_list_dict = count_occurrence(statphase_list)
+        galaxy_output = [(str(a) + "(" + str(b) + ")", a, False)for a, b in amounts_in_list_dict.items()]
+
     return(sorted(galaxy_output))
@@ -58,22 +66,26 @@
     @param column_type_name: column type to filter on
     @param statphase: stationary phase of the column to filter on
     '''
-    (data, header) = read_library(library_file)
-
-    if ('columntype' not in header or
-        'columnphasetype' not in header or
-        'columnname' not in header):
-        raise IOError('Missing columns in ', library_file)
-
-    column_type_column = header.index("columntype")
-    statphase_column = header.index("columnphasetype")
-    column_name_column = header.index("columnname")
-
-    # Filter data on given column type name and stationary phase
-    statphase_list = [line[column_name_column] for line in data if line[column_type_column] == column_type_name and
-                      line[statphase_column] == statphase]
-    amounts_in_list_dict = count_occurrence(statphase_list)
-    galaxy_output = [(str(a) + "(" + str(b) + ")", a, False)for a, b in amounts_in_list_dict.items()]
+    if library_file == "":
+        galaxy_output = [("", "", False)]
+    else:
+        (data, header) = read_library(library_file)
+
+        if ('columntype' not in header or
+            'columnphasetype' not in header or
+            'columnname' not in header):
+            raise IOError('Missing columns in ', library_file)
+
+        column_type_column = header.index("columntype")
+        statphase_column = header.index("columnphasetype")
+        column_name_column = header.index("columnname")
+
+        # Filter data on given column type name and stationary phase
+        statphase_list = [line[column_name_column] for line in data if line[column_type_column] == column_type_name and
+                          line[statphase_column] == statphase]
+        amounts_in_list_dict = count_occurrence(statphase_list)
+        galaxy_output = [(str(a) + "(" + str(b) + ")", a, False)for a, b in amounts_in_list_dict.items()]
+
     return(sorted(galaxy_output))
@@ -96,9 +108,10 @@
     fill a Galaxy drop-down combo box.
 
     '''
-    files = glob.glob(dir_name + "/*.txt")
+    files = glob.glob(dir_name + "/*.*")
     if len(files) == 0:
-        raise Exception("Configuration error: no library files found in <galaxy-home-dir>/" + dir_name)
+        # Configuration error: no library files found in <galaxy-home-dir>/<dir_name>:
+        galaxy_output = [("Configuration error: expected file not found in <galaxy-home-dir>/" + dir_name, "", False)]
     else:
         galaxy_output = [(str(get_file_name_no_ext(file_name)), str(os.path.abspath(file_name)), False) for file_name in files]
     return(galaxy_output)
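All three lookup functions feed Galaxy dynamic_options, which expects a list of (label, value, selected) tuples; a minimal sketch of the shape being returned (the column types and occurrence counts are invented):

    # each entry: (display label incl. occurrence count, option value, pre-selected flag)
    amounts_in_list_dict = {'capillary': 12, 'packed': 3}  # invented example counts
    galaxy_output = [(str(a) + "(" + str(b) + ")", a, False)
                     for a, b in amounts_in_list_dict.items()]
    # e.g. [('capillary(12)', 'capillary', False), ('packed(3)', 'packed', False)]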
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/metaMS_cmd_interface.r	Fri Oct 24 12:52:56 2014 +0200
@@ -0,0 +1,99 @@
+## read args:
+args <- commandArgs(TRUE)
+## the constructed DB, e.g. "E:/Rworkspace/metaMS/data/LCDBtest.RData"
+args.constructedDB <- args[1]
+## data files, e.g. "E:/Rworkspace/metaMS/data/data.zip" (with e.g. .CDF files) and unzip output dir, e.g. "E:/"
+args.dataZip <- args[2]
+args.zipExtrDir <- args[3]
+## settings file, e.g. "E:/Rworkspace/metaMS/data/settings.r"; should contain an assignment to an object named "customMetaMSsettings"
+args.settings <- args[4]
+
+## output file names, e.g. "E:/Rworkspace/metaMS/data/out.txt"
+args.outAnnotationTable <- args[5]
+args.outLogFile <- args[6]
+args.xsetOut <- args[7]
+
+## report files
+args.htmlReportFile <- args[8]
+args.htmlReportFile.files_path <- args[9]
+
+# Redirect all R messages and output to the log file using sink(), see http://mazamascience.com/WorkingWithData/?p=888
+msg <- file(args.outLogFile, open="wt")
+sink(msg, type="message")
+sink(msg, type="output")
+
+cat("\nSettings used===============:\n")
+cat(readChar(args.settings, 1e5))
+
+
+tryCatch(
+        {
+            library(metaMS)
+
+            ## load the constructed DB:
+            tempEnv <- new.env()
+            testDB <- load(args.constructedDB, envir=tempEnv)
+
+            ## load the data files from a zip file
+            files <- unzip(args.dataZip, exdir=args.zipExtrDir)
+
+            ## load settings "script" into "customMetaMSsettings"
+            source(args.settings, local=tempEnv)
+            message(paste(" loaded : ", args.settings))
+
+            # trigger runLC (run with 20 slave processes and return the xcmsSet as well):
+            LC <- runLC(files, settings = tempEnv[["customMetaMSsettings"]], DB = tempEnv[[testDB[1]]]$DB, nSlaves=20, returnXset = TRUE)
+
+            # write out runLC annotation results:
+            write.table(LC$Annotation$annotation.table, args.outAnnotationTable, sep="\t", row.names=FALSE)
+
+            # the used constructed DB (write to log):
+            cat("\nConstructed DB info===============:\n")
+            str(tempEnv[[testDB[1]]]$Info)
+            cat("\nConstructed DB table===============:\n")
+            write.table(tempEnv[[testDB[1]]]$DB, args.outLogFile, append=TRUE, row.names=FALSE)
+            write.table(tempEnv[[testDB[1]]]$Reftable, args.outLogFile, sep="\t", append=TRUE, row.names=FALSE)
+            # save xset as rdata:
+            xsetData <- LC$xset@xcmsSet
+            saveRDS(xsetData, file=args.xsetOut)
+
+            message("\nGenerating report.........")
+            # report
+            dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE)
+            setwd(file.path(args.htmlReportFile.files_path))
+            html <- "<html><body><h1>Extracted Ion Chromatograms of groups with more than 3 peaks</h1>"
+
+            gt <- groups(LC$xset@xcmsSet)
+            groupidx1 <- which(gt[,"rtmed"] > 0 & gt[,"rtmed"] < 3000 & gt[,"npeaks"] > 3)
+            if (length(groupidx1) > 0)
+            {
+                eiccor <- getEIC(LC$xset@xcmsSet, groupidx = c(groupidx1))
+                eicraw <- getEIC(LC$xset@xcmsSet, groupidx = c(groupidx1), rt = "raw")
+                for (i in 1:length(groupidx1))
+                {
+                    figureName <- paste(args.htmlReportFile.files_path, "/figure", i,".png", sep="")
+                    html <- paste(html,"<img src='", "figure", i,".png' />", sep="")
+                    png( figureName )
+                    plot(eiccor, LC$xset@xcmsSet, groupidx = i)
+                    devname = dev.off()
+                }
+            }
+
+
+            html <- paste(html,"</body></html>")
+            message("finished generating report")
+            write(html,file=args.htmlReportFile)
+            # unlink(args.htmlReportFile)
+            cat("\nWarnings================:\n")
+            str( warnings() )
+        },
+        error=function(cond) {
+            sink(NULL, type="message") # default setting
+            sink(stderr(), type="output")
+            message("\nERROR: ===========\n")
+            print(cond)
+        }
    )
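The script expects nine positional arguments, in the order read at the top of the file; a hypothetical command line (all paths made up):

    Rscript metaMS_cmd_interface.r LCDBtest.RData data.zip /tmp/extracted settings.r \
        annotations.tsv run.log xset.rds report.html report_files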
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/metams_lcms_annotate.xml	Fri Oct 24 12:52:56 2014 +0200
@@ -0,0 +1,138 @@
+<tool id="metams_lcms_annotate" name="METAMS-LC/MS Annotate" version="0.0.1">
+  <description>Runs the metaMS process for LC/MS feature grouping and annotation</description>
+  <requirements>
+    <!-- <requirement type="set_environment">XTANDEM_12_10_01_PATH</requirement>-->
+    <requirement type="package" version="3.1.1">R_bioc_metams</requirement>
+  </requirements>
+  <command interpreter="/home/lukas007/R/R-3.1.1/bin/Rscript">
+    metaMS_cmd_interface.r
+    $constructed_db
+    $data_files
+    $data_files"dir/"
+    $customMetaMSsettings
+    $outputFile
+    $outputLog
+    $xsetOut
+    $htmlReportFile
+    $htmlReportFile.files_path
+  </command>
+  <inputs>
+    <param name="constructed_db" type="select" label="Constructed DB"
+           help="Reference annotation database generated from matching measurements of a mixture of chemical standards
                 against a manually validated reference table which contains the key analytical information for each standard."
+           dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/metaMS")'/>
+
+    <param name="data_files" type="data" format="prims.fileset.zip" label="Data files (.zip file with CDFs)" help=".zip file containing the CDF files of the new measurements"/>
+
+    <param name="protocolName" type="text" size="30" label="protocolName" value="Synapt.QTOF.RP" help="protocolName"/>
+
+    <param name="method" type="select" size="30" label="PEAK PICKING method =====================================================">
+      <option value="matchedFilter" selected="true">matchedFilter</option>
+    </param>
+    <param name="step" type="float" size="10" value="0.05" label="step" help="step"/>
+    <param name="fwhm" type="integer" size="10" value="20" label="fwhm" help="fwhm" />
+    <param name="snthresh" type="integer" size="10" value="4" label="snthresh" help="snthresh" />
+    <param name="max" type="integer" size="10" value="50" label="max" help="max" />
+
+    <param name="min_class_fraction" type="float" size="10" value="0.3" label="ALIGNMENT min.class.fraction =====================================================" help="min.class.fraction"/>
+    <param name="min_class_size" type="integer" size="10" value="3" label="min.class.size" help="min.class.size" />
+    <param name="mzwid" type="float" size="10" value="0.1" label="mzwid" help="mzwid"/>
+    <param name="bws" type="text" size="10" value="30,10" label="bws" help="bws"/>
+    <param name="missingratio" type="float" size="10" value="0.2" label="missingratio" help="missingratio"/>
+    <param name="extraratio" type="float" size="10" value="0.1" label="extraratio" help="extraratio"/>
+    <param name="retcormethod" type="select" size="30" label="retcormethod" help="retcormethod">
+      <option value="linear" selected="true">linear</option>
+    </param>
+    <param name="retcorfamily" type="select" size="30" label="retcorfamily" help="retcorfamily">
+      <option value="symmetric" selected="true">symmetric</option>
+    </param>
+    <param name="fillPeaks" type="select" size="30" label="fillPeaks" help="fillPeaks">
+      <option value="TRUE" selected="true">Yes</option>
+      <option value="FALSE">No</option>
+    </param>
+    <param name="perfwhm" type="float" size="10" value="0.6" label="CAMERA perfwhm =====================================================" help="perfwhm"/>
+    <param name="cor_eic_th" type="float" size="10" value="0.7" label="cor_eic_th" help="cor_eic_th" />
+    <param name="ppm" type="float" size="10" value="5.0" label="ppm" help="ppm" />
+    <param name="rtdiff" type="float" size="10" value="1.5" label="MATCH2DB rtdiff =====================================================" help="rtdiff"/>
+    <param name="rtval" type="float" size="10" value="0.1" label="rtval" help="rtval" />
+    <param name="mzdiff" type="float" size="10" value="0.005" label="mzdiff" help="mzdiff" />
+    <param name="match2DB_ppm" type="float" size="10" value="5.0" label="ppm" help="ppm" />
+    <param name="minfeat" type="integer" size="10" value="2" label="minfeat" help="minfeat" />
+
+  </inputs>
+  <configfiles>
+
+    <configfile name="customMetaMSsettings">## start comment
+    ## metaMS process settings
+    customMetaMSsettings <- metaMSsettings(protocolName = "${protocolName}",
+                                           chrom = "LC",
+                                           PeakPicking = list(
+                                               method = "${method}",
+                                               step = ${step},
+                                               fwhm = ${fwhm},
+                                               snthresh = ${snthresh},
+                                               max = ${max}),
+                                           Alignment = list(
+                                               min.class.fraction = ${min_class_fraction},
+                                               min.class.size = ${min_class_size},
+                                               mzwid = ${mzwid},
+                                               bws = c(${bws}),
+                                               missingratio = ${missingratio},
+                                               extraratio = ${extraratio},
+                                               retcormethod = "${retcormethod}",
+                                               retcorfamily = "${retcorfamily}",
+                                               fillPeaks = ${fillPeaks}),
+                                           CAMERA = list(
+                                               perfwhm = ${perfwhm},
+                                               cor_eic_th = ${cor_eic_th},
+                                               ppm= ${ppm}))
metaSetting(customMetaMSsettings, "match2DB") <- list(
    rtdiff = ${rtdiff},
    rtval = ${rtval},
    mzdiff = ${mzdiff},
    ppm = ${match2DB_ppm},
    minfeat = ${minfeat})</configfile>

+  </configfiles>
+
+  <outputs>
+    <data name="outputFile" format="tabular" label="${tool.name} on ${on_string} - metaMS annotated file (TSV)"/>
+    <data name="outputLog" format="txt" label="${tool.name} on ${on_string} - metaMS LOG"/>
+    <data name="xsetOut" format="rdata" label="${tool.name} on ${on_string} - metaMS xcmsSet (RDATA)"/>
+    <data name="htmlReportFile" format="html" label="${tool.name} on ${on_string} - metaMS report (HTML)"/>
+  </outputs>
+  <tests>
+    <test>
+    </test>
+  </tests>
+  <code file="match_library.py" /> <!-- file containing the get_directory_files function used above -->
+  <help>
+
+.. class:: infomark
+
+Runs the metaMS process for LC/MS feature grouping and annotation. Parts of the metaMS process also make use of the XCMS and CAMERA tools and algorithms.
+The figure below shows the main parts of the metaMS process.
+
+.. image:: $PATH_TO_IMAGES/metaMS.png
+
+
+**References**
+
+If you use this Galaxy tool in work leading to a scientific publication please
+cite the following paper:
+
+Wehrens, R.; Weingart, G.; Mattivi, F. (2014).
+metaMS: an open-source pipeline for GC-MS-based untargeted metabolomics.
+Journal of Chromatography B, 966: 109-116.
+doi: 10.1016/j.jchromb.2014.02.051
+handle: http://hdl.handle.net/10449/24012
+
+  </help>
+  <citations>
+    <citation type="doi">10.1016/j.jchromb.2014.02.051</citation> <!-- example
+         see also https://wiki.galaxyproject.org/Admin/Tools/ToolConfigSyntax#A.3Ccitations.3E_tag_set
+         -->
+  </citations>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/msclust.xml	Fri Oct 24 12:52:56 2014 +0200
@@ -0,0 +1,341 @@
+<tool name="MsClust" id="msclust2" version="2.0.5">
+  <description>Extracts fragmentation spectra from aligned data</description>
+  <!--
+     For remote debugging start your listener on port 8000 and use the following as command interpreter:
+     java -jar -Xdebug -Xrunjdwp:transport=dt_socket,address=D0100564.wurnet.nl:8000
+     //////////////////////////
+
+     TODO in command below: add conditionals according to options of using or NOT the tolerances/thresholds from previous steps
+  -->
+  <command interpreter="java -jar ">
+    MsClust.jar
+    -peaksFileName $inputPeaks
+    -dataType $dataType
+    -imputationMethod $imputationMethod.type
+    #if $imputationMethod.type == "valueRange"
+        -rangeUpperLimit $imputationMethod.rangeUpperLimit
+    #end if
+    -plInputFormat "metalign"
+    -potDensFuncType $potDensFuncType.type
+    -centerSelectionType $centerSelectionType.type
+    -clusteringType $clusteringType.type
+    -neighborhoodWindowSize $potDensFuncType.pdf_neighborhoodWindowSize
+    -clusterSearchStopCriterium $centerSelectionType.cs_stop_criterion
+    -pearsonDistTreshold $potDensFuncType.pdf_pears_treshold
+    -pearsonTresholdConfidence $potDensFuncType.pdf_pears_conf
+    -pearsonPDReductionThreshold $centerSelectionType.cs_pears_pd_reductionTreshold
+    -pearsonPDReductionSlope $centerSelectionType.cs_pears_pd_reductionSlope
+    -rtDistTolUnit $potDensFuncType.rt_dist_tol_unit.type
+    -rtDistTol $potDensFuncType.rt_dist_tol_unit.pdf_rt_toler
+    -rtDistanceConfidence $potDensFuncType.pdf_scan_conf
+    #if $clusteringType.type == "original"
+        -clustMembershipCutoff $clusteringType.clust_membership_cutoff
+    #end if
+    -centrotypesOut $centrotypesOut
+    -simOut $simOut
+    -micOut $micOut
+    -mspOut $mspOut
+    -classOut $classOut
+    -outReport $htmlReportFile
+    -outReportPicturesPath $htmlReportFile.files_path
+    #if $clusteringType.type == "fuzzyCMeans"
+        -fcmMembershipWeightingExponent $clusteringType.fcmMembershipWeightingExponent
+        -fcmStopCriterion $clusteringType.fcmStopCriterion
+        -fcmCorrelationWeight $clusteringType.fcmCorrelationWeight
+        -fcmFinalAssemblyType $clusteringType.finalClusterAssembly.type
+        #if $clusteringType.finalClusterAssembly.type == "membershipBased"
+            -fcmMembershipCutoff $clusteringType.finalClusterAssembly.fcmMembershipCutoff
+        #end if
+    #end if
+    -verbose "false"
+    #if $advancedSettings.settings == True
+        -advancedSettings YES
+        -saturationLimit $advancedSettings.saturationLimit
+        -sampleSelectionSortType $advancedSettings.sampleSelectionSortType
+        -simSelectionAlgorithm $advancedSettings.simSelectionAlgorithm
+        -simMassFilter "$advancedSettings.simMassFilter"
+        -simMembershipThreshold $advancedSettings.simMembershipThreshold
+        -simSaturationThreshold $advancedSettings.simSaturationThreshold
+        -simAbsenseThreshold $advancedSettings.simAbsenseThreshold
+        -micMembershipThreshold $advancedSettings.micMembershipThreshold
+        -peakIntensityCorrectionAlgorithm $advancedSettings.peakIntensityCorrectionAlgorithm
+    #else
+        -advancedSettings YES
+        -sampleSelectionSortType SIM_INTENSITY
+        -peakIntensityCorrectionAlgorithm CORRELATION_BASED
+    #end if
+
+  </command>
+  <inputs>
+
+    <param name="inputPeaks" type="data" format="txt" label="Ion-wise aligned data (e.g. MetAlign output data)" />
+    <param name="dataType" type="select" size="30" label="Data type">
+      <option value="gcms" selected="true">GC-MS</option>
+      <option value="lcms">LC-MS</option>
+    </param>
+    <conditional name="imputationMethod">
+      <param name="type" type="select" size="30" label="Select the approach used for imputing missing values (optional)" help="select how you generated the values to fill in the data gaps">
+        <option value="none" >none</option>
+        <option value="metot" selected="true">MeTot</option>
+        <option value="valueRange">Values range</option>
+      </param>
+      <when value="valueRange">
+        <param name="rangeUpperLimit" type="integer" size="10" value="0" label="Range upper limit" help="values up to this limit will be considered 'generated' values" />
+      </when>
+      <when value="metot">
+      </when>
+      <when value="none">
+      </when>
+    </conditional>
+    <conditional name="potDensFuncType">
+      <param name="type" type="select" size="30" label="Select PD function type =====================================================">
+        <option value="original" selected="true">Original</option>
+      </param>
+      <when value="original">
+        <param name="pdf_neighborhoodWindowSize" type="integer" size="10" value="200" label="Effective Peaks" />
+        <conditional name="rt_dist_tol_unit">
+          <param name="type" type="select" size="30" label="Peak time unit">
+            <option value="1" selected="true">scan nr</option>
+            <option value="2" >(average) micro minutes</option>
+          </param>
+          <when value="1">
+            <param name="pdf_rt_toler" type="float" size="10" value="10" label="Peak Width, in scans" />
+          </when>
+          <when value="2">
+            <param name="pdf_rt_toler" type="float" size="10" value="100000" label="Peak Width, in micro minutes" help="e.g. 100,000=6 seconds" />
+          </when>
+        </conditional>
+        <param name="pdf_scan_conf" type="float" size="10" value="80" label="Peak Width confidence (0.0 to 99.99)" help="example: 0[no confidence]...50[good guess]...99.9[quite certain])" />
+        <param name="pdf_pears_treshold" type="float" size="10" value="0.8" label="Correlation threshold (0.0 - 1.0)" />
+        <param name="pdf_pears_conf" type="float" size="10" value="98.0" label="Correlation threshold confidence (0.0 to 99.99)" help="example: 0[no confidence]...50[good guess]...99.9[quite certain])" />
+      </when>
+    </conditional>
+    <conditional name="centerSelectionType">
+      <param name="type" type="select" label="Initial Centers selection type ==================================================" >
+        <option value="original" selected="true">Original - Subtractive potential reductions with stop criterion and REUSE tolerances (from PD function)</option>
+      </param>
+      <when value="original">
+        <param name="cs_pears_pd_reductionTreshold" type="float" size="10" value="0.8" label="Potential Density reduction (0.0 - 1.0)" />
+        <param name="cs_pears_pd_reductionSlope" type="float" size="10" value="0.01" label="Potential Density reduction softness " />
+        <param name="cs_stop_criterion" type="float" size="10" value="2" label="Stop Criterion " />
+      </when>
+    </conditional>
+    <conditional name="clusteringType">
+      <param name="type" type="select" label="Classify using ===========================================================">
+        <option value="original" selected="true">Original - Fuzzy clustering, keep original centers and REUSE (scan distance) tolerances</option>
+        <option value="fuzzyCMeans">(experimental) Fuzzy C-Means - Fuzzy clustering, optimize centers</option>
+      </param>
+      <when value="original">
+        <param name="clust_membership_cutoff" type="float" size="10" value=""
+               label="Membership cutoff (0.0 - 1.0)"
+               help="Items with membership below this value are NOT added to the cluster"/>
+      </when>
+      <!-- one idea would be to have clustering specific tolerance values, not reusing the centrotype selection ones
+      <when value="originalNewTol">
+        <param name="clust_scan_toler" type="float" size="10" value="10" label="Peak Width, in scans" />
+        <param name="clust_scan_slope" type="float" size="10" value="2" label="Peak Width margin softness" />
+      </when>
+      -->
+      <when value="fuzzyCMeans">
+        <param name="fcmMembershipWeightingExponent" type="float" size="10" value="2.0" label="Membership Weighting Exponent" help="Influences cluster center repositioning in the iterations 1.1 (exploratory) to around 3.0 (conservative)" />
+        <param name="fcmStopCriterion" type="float" size="10" value="0.05" label="Stop Criterion" help="When convergence is 'reached' (e.g. 0.05 means memberships only changed by 5% in the last iteration)" />
+        <param name="fcmCorrelationWeight" type="float" size="10" value="2" label="Correlation weight factor" help="Increase this if you think the correlation is reliable (e.g. you have a high number of samples)" />
+        <conditional name="finalClusterAssembly">
+          <param name="type" type="select" label="Final cluster assembly" >
+            <option value="original" selected="true">Original - distance based</option>
+            <option value="membershipBased">Membership based</option>
+          </param>
+          <when value="membershipBased">
+            <param name="fcmMembershipCutoff" type="select" label="Maximum allowed peak overlap" >
+              <option value="0.05" >~7 clusters</option>
+              <option value="0.10" >~5 clusters</option>
+              <option value="0.20" >~3 clusters</option>
+            </param>
+          </when>
+          <when value="original">
+            <!-- nothing -->
+          </when>
+        </conditional>
+      </when>
+    </conditional>
+
+    <param name="summaryReport" type="boolean" checked="true" label="Generate summary report" help="NB: this will increase the processing time (in some cases up to a few extra minutes)"/>
+
+    <conditional name="advancedSettings">
+      <param name="settings" type="boolean" truevalue="Yes" falsevalue="No" checked="false" label="Advanced settings ========================================================"/>
+      <when value="Yes">
+        <param name="saturationLimit" optional="true" type="integer" size="10" label="Saturation limit (optional)" help="fill in if you have saturation problems in your data" />
+        <param name="sampleSelectionSortType" type="select" label="Sample selection scheme for spectrum peak intensity correction algorithm (optional/experimental)" help="The intensity values to use to select the samples for each cluster/metabolite in which it is most intense/abundant. These samples are used in the peak intensity correction (see parameter below). Use this option to try to avoid samples that have insufficient signal or saturation." >
+          <option value="None">None</option>
+          <!-- in order of best FORWARD scoring when tested on /test/data/report_test_sets/(P2) Relative peak heights in spectra/Input (Test set 1) -->
+          <option value="SIM_INTENSITY" selected="true">SIM intensities</option>
+          <option value="MAX_INTENSITY">Maximum intensities</option>
+          <option value="CENTROTYPE_INTENSITY">Centrotype peak intensities</option>
+          <option value="MIC_INTENSITY">MIC intensities</option>
+        </param>
+        <param name="peakIntensityCorrectionAlgorithm" type="select" label="Spectrum peak intensity correction algorithm (optional/experimental)" help="Whether spectrum peak heights should be adjusted according to their membership to the cluster or to their correlation to the cluster's centrotype ion" >
+          <option value="MEMBERSHIP_BASED">Membership based (msclust 1.0 mode)</option>
+          <option value="CORRELATION_BASED" selected="true">Correlation based</option>
+        </param>
+        <param name="simSelectionAlgorithm" type="select" label="SIM selection algorithm (experimental)" help="Set this if you want to deviate from the standard, which is: allow shared SIM peaks for GC-MS data, and force unique SIM peaks for LC-MS data">
+          <option value="" selected="true"></option>
+          <option value="uniqueSIM">Unique SIM peak</option>
+          <option value="sharedSIM">Shared SIM peak</option>
+        </param>
+        <param name="simMassFilter" type="text" optional="true" size="30" label="SIM mass exclusion list" help="Comma-separated list of masses NOT to use as SIM peaks. E.g. '73,147,...' " />
+        <param name="simMembershipThreshold" optional="true" type="float" size="10" label="SIM membership threshold" help="Minimum membership a peak should have to qualify as a SIM candidate. E.g. 0.8 " />
+        <param name="simSaturationThreshold" optional="true" type="float" size="10" label="SIM saturation threshold (%)" help="Maximum % of samples in which a SIM candidate peak may be saturated. If the candidate peak exceeds this threshold, then another peak is chosen. If no peak can be found meeting this criterion, mass 0 is reported" />
+        <param name="simAbsenseThreshold" optional="true" type="float" size="10" label="SIM absence threshold (%)" help="Maximum % of samples in which a SIM candidate peak may be absent. If the candidate peak exceeds this threshold, then another peak is chosen. If no peak can be found meeting this criterion, mass 0 is reported" />
+
+        <param name="micMembershipThreshold" optional="true" type="float" size="10" label="MIC membership threshold" help="Minimum membership a peak should have to be counted in the MIC sum. E.g. 0.8 " />
+
+      </when>
+      <when value="No">
+      </when>
+    </conditional>
+
+
+  </inputs>
+  <outputs>
+    <data name="centrotypesOut" format="msclust.csv" label="${tool.name} on ${on_string} - centrotypes file"/>
+    <data name="simOut" format="msclust.csv" label="${tool.name} on ${on_string} - SIM file"/>
+    <data name="micOut" format="msclust.csv" label="${tool.name} on ${on_string} - MIC file"/>
+    <data name="mspOut" format="msp" label="${tool.name} on ${on_string} - SPECTRA file"/>
+    <data name="classOut" format="msclust.csv" label="${tool.name} on ${on_string} - Classification file"/>
+    <data name="htmlReportFile" format="html" label="${tool.name} on ${on_string} - HTML report">
+      <!-- If the expression is false, the file is not created -->
+      <filter>( summaryReport == True )</filter>
+    </data>
+  </outputs>
+  <tests>
+    <!-- find out how to use -->
+  </tests>
+  <help>
+
<!-- see also http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#hyperlink-targets -->

.. class:: infomark

This tool extracts spectra from ion-wise aligned MS(/MS) results. It uses expression profiles and
retention times of the putative ions to cluster them. Each cluster is then used to generate
one spectrum containing the clustered ions (peaks).

.. image:: msclust_summary.png


-----

**Input**

The input file should contain the following columns (in this order), followed by the sample intensity columns (one column with the
intensity value for each sample):

*ScanNR*

*Ret(umin)*

*Mass(uD)*

*(Optional) retentionMean*

*(only required if retentionMean is present) retentionSD*

*N sample intensity columns...*


-----

**Output**

This tool returns a number of output files and a small report.

**Parameters index**


*Select the approach used for imputing missing values:* only select this if you have used a specific method to
fill in the data gaps in the input file. One example is replacing zero values by some randomly generated low value.
If MeTot is chosen, then a value is considered generated if: the value contains a dot '.' and some number
other than 0 (zero) after the dot.

*Effective Peaks:* Neighborhood window size to consider when calculating density. Smaller values increase
performance but are less reliable.

*Peak Width, in scans:* Scan window width of scans to consider 'close'. One can see this as the
'tolerated variation in scans' for the apex positions of the fragment peaks composing a cluster.
Note: if MetAlign was used, this is the variation *after* pre-processing by MetAlign.

*Peak Width confidence:* The higher the confidence, the stricter the threshold.

*Correlation threshold (0.0 - 1.0):* Tolerance center for pearson distance calculation. The higher this value,
the higher the correlation between 2 items has to be for them to be considered 'close'.

*Correlation threshold confidence:* The higher the confidence, the stricter the threshold. `More...`__

*Potential Density reduction (0.0 - 1.0):* Reduction tolerance center for pearson distance calculation.
The higher this value, the less the low correlated items get reduced, getting a chance to form a cluster of their own.

*Potential Density reduction softness:* Reduction curve slope for pearson distance tolerance. Lower
values = stricter separation at the value determined in 'Potential Density reduction' above
(TODO review this comment).

*Stop Criterion:* When to stop reducing and looking for new clusters. Lower values = more iterations

.. __: javascript:window.open('.. image:: confidence_and_slope_params_explain.png'.replace('.. image:: ', ''),'popUpWindow','height=700,width=800,left=10,top=10,resizable=yes,scrollbars=yes,toolbar=yes,menubar=no,location=no,directories=no,status=yes')


-----

**Output files described below**

-----

*SPECTRA:* this file can be submitted to NIST for identification of the spectra.

`Click here for more details on the Sample selection and Spectrum peak intensity correction algorithm parameters related to SPECTRA generation`_

.. _Click here for more details on the Sample selection and Spectrum peak intensity correction algorithm parameters related to SPECTRA generation: javascript:window.open('.. image:: sample_sel_and_peak_height_correction.png'.replace('.. image:: ', ''),'popUpWindow','height=700,width=800,left=10,top=10,resizable=yes,scrollbars=yes,toolbar=yes,menubar=no,location=no,directories=no,status=yes')

-----

*MIC:* stands for Measured Ions Count -> it contains, for each cluster, the sum of the ion count
values (corrected by their membership) for all MEASURED cluster ions in the given sample.

The MIC for a **cluster i** in **sample s**, where **cluster i** has **n** members, is thus:

sum ( [intensity of member n in **sample s**] x [membership value of member n in **cluster i** ] )

-----

*SIM:* stands for Selective Ion Mode -> it contains, for each cluster, the intensity values of the
most representative member ion peak of this cluster. The most representative member peak is the one with the
highest membership*average_intensity. This definition leads to conflicts, as a peak can have a
membership in two or more clusters. The assignment of a SIM peak to a cluster depends on
the configured data type (LC or GC-MS). NB: this can be overruled in the "advanced settings":

(1) LC-MS SIM: select SIM peak only once and for the centrotype in which this specific mass has its
highest membership; for neighboring centrotypes use its "second best SIM", etcetera. In other words,
if the SIM peak has been identified as the SIM in more than 1 cluster, assign it as SIM to the cluster
with the highest membership. Continue searching for other SIM peaks to assign to the other clusters until
all ambiguities are solved.

(2) GC-MS SIM: the SIM peak can be "shared" by multiple clusters. However, the intensity values are corrected
by the membership value of the peak in the cluster in case the SIM peak is "shared". If the SIM peak is not
"shared" then the "raw" intensity values of the SIM peak are recorded in the SIM file.

`Click here for more details on the SIM output file`_

.. _Click here for more details on the SIM output file: javascript:window.open('.. image:: sample_SIM.png'.replace('.. image:: ', ''),'popUpWindow','height=700,width=800,left=10,top=10,resizable=yes,scrollbars=yes,toolbar=yes,menubar=no,location=no,directories=no,status=yes')


**References**

If you use this Galaxy tool in work leading to a scientific publication please
cite the following paper:

Y. M. Tikunov, S. Laptenok, R. D. Hall, A. Bovy, and R. C. H. de Vos (2012).
MSClust: a tool for unsupervised mass spectra extraction of
chromatography-mass spectrometry ion-wise aligned data
http://dx.doi.org/10.1007%2Fs11306-011-0368-2

  </help>
</tool>
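The MIC definition above is simply a membership-weighted sum; a small standalone sketch (intensities and memberships for a three-member cluster are invented):

    # membership-weighted sum of measured intensities for one cluster in one sample
    intensities = [1200.0, 450.0, 80.0]   # invented member-ion intensities in sample s
    memberships = [0.95, 0.80, 0.55]      # invented memberships in cluster i
    mic = sum(x * m for x, m in zip(intensities, memberships))
    print(mic)  # 1544.0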
--- a/msclust2.0.1.xml Thu Jan 16 13:22:38 2014 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,289 +0,0 @@ -<tool name="MsClust" id="msclust2" version="2.0.1"> - <description>Extracts fragmentation spectra from aligned data</description> - <!-- - For remote debugging start you listener on port 8000 and use the following as command interpreter: - java -jar -Xdebug -Xrunjdwp:transport=dt_socket,address=D0100564.wurnet.nl:8000 - ////////////////////////// - - TODO in command below: add conditionals according to options of using or NOT the tolerances/thresholds from previous steps - --> - <command interpreter="java -jar "> - MsClust.jar - -peaksFileName $inputPeaks - -dataType $dataType - -imputationMethod $imputationMethod.type - #if $imputationMethod.type == "valueRange" - -rangeUpperLimit $imputationMethod.rangeUpperLimit - #end if - -plInputFormat "metalign" - -potDensFuncType $potDensFuncType.type - -centerSelectionType $centerSelectionType.type - -clusteringType $clusteringType.type - -neighborhoodWindowSize $potDensFuncType.pdf_neighborhoodWindowSize - -clusterSearchStopCriterium $centerSelectionType.cs_stop_criterion - -pearsonDistTreshold $potDensFuncType.pdf_pears_treshold - -pearsonTresholdConfidence $potDensFuncType.pdf_pears_conf - -pearsonPDReductionThreshold $centerSelectionType.cs_pears_pd_reductionTreshold - -pearsonPDReductionSlope $centerSelectionType.cs_pears_pd_reductionSlope - -scanDistTol $potDensFuncType.pdf_scan_toler - -scanDistanceConfidence $potDensFuncType.pdf_scan_conf - -centrotypesOut $centrotypesOut - -simOut $simOut - -micOut $micOut - -mspOut $mspOut - -classOut $classOut - -outReport $htmlReportFile - -outReportPicturesPath $htmlReportFile.files_path - #if $clusteringType.type == "fuzzyCMeans" - -fcmMembershipWeightingExponent $clusteringType.fcmMembershipWeightingExponent - -fcmStopCriterion $clusteringType.fcmStopCriterion - -fcmCorrelationWeight $clusteringType.fcmCorrelationWeight - -fcmFinalAssemblyType $clusteringType.finalClusterAssembly.type - #if $clusteringType.finalClusterAssembly.type == "membershipBased" - -fcmMembershipCutoff $clusteringType.finalClusterAssembly.fcmMembershipCutoff - #end if - #end if - -verbose "false" - #if $advancedSettings.settings == True - -advancedSettings YES - -saturationLimit $advancedSettings.saturationLimit - -sampleSelectionSortType $advancedSettings.sampleSelectionSortType - -simSelectionAlgorithm $advancedSettings.simSelectionAlgorithm - -simMassFilter "$advancedSettings.simMassFilter" - -simMembershipThreshold $advancedSettings.simMembershipThreshold - -simSaturationThreshold $advancedSettings.simSaturationThreshold - -simAbsenseThreshold $advancedSettings.simAbsenseThreshold - -micMembershipThreshold $advancedSettings.micMembershipThreshold - -peakIntensityCorrectionAlgorithm $advancedSettings.peakIntensityCorrectionAlgorithm - #else - -advancedSettings YES - -sampleSelectionSortType SIM_INTENSITY - -peakIntensityCorrectionAlgorithm CORRELATION_BASED - #end if - - </command> - <inputs> - <!-- <param name="rankingWeightConfig" type="text" area="true" size="11x70" label="NB - TEST VERSION" -value="VERSION BEING TESTED AT THIS MOMENT...NOT READY FOR USE..."/> - --> - <param name="inputPeaks" type="data" format="txt" label="Ion-wise aligned data (e.g. 
MetAlign output data)" /> - <param name="dataType" type="select" size="30" label="Data type"> - <option value="gcms" selected="true">GC-MS</option> - <option value="lcms">LC-MS</option> - </param> - <conditional name="imputationMethod"> - <param name="type" type="select" size="30" label="Select the approach used for imputing missing values (optional)" help="select how you generated the values to fill in the data gaps"> - <option value="none" >none</option> - <option value="metot" selected="true">MeTot</option> - <option value="valueRange">Values range</option> - </param> - <when value="valueRange"> - <param name="rangeUpperLimit" type="integer" size="10" value="0" label="Range upper limit" help="values up to this limit will be considered 'generated' values" /> - </when> - </conditional> - <conditional name="potDensFuncType"> - <param name="type" type="select" size="30" label="Select PD function type ====================================================="> - <option value="original" selected="true">Original</option> - </param> - <when value="original"> - <param name="pdf_neighborhoodWindowSize" type="integer" size="10" value="200" label="Effective Peaks" /> - <param name="pdf_scan_toler" type="float" size="10" value="10" label="Peak Width, in scans" /> - <param name="pdf_scan_conf" type="float" size="10" value="80" label="Peak Width confidence (0.0 to 99.99)" help="example: 0[no confidence]...50[good guess]...99.9[quite certain])" /> - <param name="pdf_pears_treshold" type="float" size="10" value="0.8" label="Correlation threshold (0.0 - 1.0)" /> - <param name="pdf_pears_conf" type="float" size="10" value="98.0" label="Correlation threshold confidence (0.0 to 99.99)" help="example: 0[no confidence]...50[good guess]...99.9[quite certain])" /> - </when> - </conditional> - <conditional name="centerSelectionType"> - <param name="type" type="select" label="Initial Centers selection type ==================================================" > - <option value="original" selected="true">Original - Subtractive potential reductions with stop criterion and REUSE tolerances (from PD function)</option> - </param> - <when value="original"> - <param name="cs_pears_pd_reductionTreshold" type="float" size="10" value="0.8" label="Potential Density reduction (0.0 - 1.0)" /> - <param name="cs_pears_pd_reductionSlope" type="float" size="10" value="0.01" label="Potential Density reduction softness " /> - <param name="cs_stop_criterion" type="float" size="10" value="2" label="Stop Criterion " /> - </when> - </conditional> - <conditional name="clusteringType"> - <param name="type" type="select" label="Classify using ==========================================================="> - <option value="original" selected="true">Original - Fuzzy clustering, keep original centers and REUSE (scan distance) tolerances</option> - <option value="fuzzyCMeans">(experimental) Fuzzy C-Means - Fuzzy clustering, optimize centers</option> - </param> - <when value="original"> - <!-- nothing --> - </when> - <when value="originalNewTol"> - <param name="clust_scan_toler" type="float" size="10" value="10" label="Peak Width, in scans" /> - <param name="clust_scan_slope" type="float" size="10" value="2" label="Peak Width margin softness" /> - </when> - <when value="fuzzyCMeans"> - <param name="fcmMembershipWeightingExponent" type="float" size="10" value="2.0" label="Membership Weighting Exponent" help="Influences cluster center repositioning in the iterations 1.1 (exploratory) to around 3.0 (conservative)" /> - <param name="fcmStopCriterion" 
type="float" size="10" value="0.05" label="Stop Criterion" help="When convergence is 'reached' (e.g. 0.05 means memberships only changed by 5% in the last iteration)" /> - <param name="fcmCorrelationWeight" type="float" size="10" value="2" label="Correlation weight factor" help="Increase this if you think the correlation is reliable (e.g. you have a high number of samples)" /> - <conditional name="finalClusterAssembly"> - <param name="type" type="select" label="Final cluster assembly" > - <option value="original" selected="true">Original - distance based</option> - <option value="membershipBased">Membership based</option> - </param> - <when value="membershipBased"> - <param name="fcmMembershipCutoff" type="select" label="Maximum allowed peak overlap" > - <option value="0.05" >~7 clusters</option> - <option value="0.10" >~5 clusters</option> - <option value="0.20" >~3 clusters</option> - </param> - </when> - <when value="original"> - <!-- nothing --> - </when> - </conditional> - </when> - </conditional> - - <param name="summaryReport" type="boolean" checked="true" label="Generate summary report" help="NB: this will increase the processing time (in some cases up to a few extra minutes)"/> - - <conditional name="advancedSettings"> - <param name="settings" type="boolean" truevalue="Yes" falsevalue="No" checked="false" label="Advanced settings ========================================================"/> - <when value="Yes"> - <param name="saturationLimit" optional="true" type="integer" size="10" label="Saturation limit (optional)" help="fill in if you have saturation problems in your data" /> - <param name="sampleSelectionSortType" type="select" label="Sample selection scheme for spectrum peak intensity correction algorithm (optional/experimental)" help="The intensity values to use to select the samples for each cluster/metabolite in which it is most intense/abundant. These samples are used in the peak intensity correction (see parameter below). Use this option to try to avoid samples that have insufficient signal or saturation." 
> - <option value="None">None</option> - <!-- in order of best FORWARD scoring when tested on /test/data/report_test_sets/(P2) Relative peak heights in spectra/Input (Test set 1) --> - <option value="SIM_INTENSITY" selected="true">SIM intensities</option> - <option value="MAX_INTENSITY">Maximum intensities</option> - <option value="CENTROTYPE_INTENSITY">Centrotype peak intensities</option> - <option value="MIC_INTENSITY">MIC intensities</option> - </param> - <param name="peakIntensityCorrectionAlgorithm" type="select" label="Spectrum peak intensity correction algorithm (optional/experimental)" help="Whether spectrum peak heights should be adjusted according to their membership to the cluster or to their correlation to the cluster's centrotype ion" > - <option value="MEMBERSHIP_BASED">Membership based (msclust 1.0 mode)</option> - <option value="CORRELATION_BASED" selected="true">Correlation based</option> - </param> - <param name="simSelectionAlgorithm" type="select" label="SIM selection algorithm (experimental)" help="Set this if you want to deviate from the standard, which is: allow shared SIM peaks for GC-MS data, and force unique SIM peaks for LC-MS data"> - <option value="" selected="true"></option> - <option value="uniqueSIM">Unique SIM peak</option> - <option value="sharedSIM">Shared SIM peak</option> - </param> - <param name="simMassFilter" type="text" optional="true" size="30" label="SIM mass exclusion list" help="Comma-separated list of masses NOT to use as SIM peaks. E.g. '73,147,...' " /> - <param name="simMembershipThreshold" optional="true" type="float" size="10" label="SIM membership threshold" help="Minimum membership a peak should have to qualify as a SIM candidate. E.g. 0.8 " /> - <param name="simSaturationThreshold" optional="true" type="float" size="10" label="SIM saturation threshold (%)" help="Maximum % of samples in which a SIM candidate peak may be saturated. If the candidate peak exceeds this threshold, then another peak is chosen. If no peak can be found meeting this criterion, mass 0 is reported" /> - <param name="simAbsenseThreshold" optional="true" type="float" size="10" label="SIM absence threshold (%)" help="Maximum % of samples in which a SIM candidate peak may be absent. If the candidate peak exceeds this threshold, then another peak is chosen. If no peak can be found meeting this criterion, mass 0 is reported" /> - - <param name="micMembershipThreshold" optional="true" type="float" size="10" label="MIC membership threshold" help="Minimum membership a peak should have to be counted in the MIC sum. E.g. 0.8 " /> - - </when> - </conditional> - - - </inputs> - <outputs> - <data name="centrotypesOut" format="msclust.csv" label="${tool.name} on ${on_string} - centrotypes file"/> - <data name="simOut" format="msclust.csv" label="${tool.name} on ${on_string} - SIM file"/> - <data name="micOut" format="msclust.csv" label="${tool.name} on ${on_string} - MIC file"/> - <data name="mspOut" format="msp" label="${tool.name} on ${on_string} - SPECTRA file"/> - <data name="classOut" format="msclust.csv" label="${tool.name} on ${on_string} - Classification file"/> - <data name="htmlReportFile" format="html" label="${tool.name} on ${on_string} - HTML report"> - <!-- If the expression is false, the file is not created --> - <filter>( summaryReport == True )</filter> - </data> - </outputs> - <tests> - <!-- find out how to use --> - </tests> - <help> - -<!-- see also http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#hyperlink-targets --> - -.. 
class:: infomark - -This tool extracts spectra from ion-wise aligned MS(/MS) results. It uses expression profiles and -retention times of the putative ions to cluster them. Each cluster is then used to generate -one spectrum containing the clustered ions (peaks). - -.. image:: $PATH_TO_IMAGES/msclust_summary.png - - ------ - -**Output** - -This tool returns a number of output files and a small report. - -**Parameters index** - - -*Select the approach used for imputing missing values:* only select this if you have used a specific method to -fill in the data gaps in the input file. One example is replacing zero values by some randomly generated low value. -If MeTot is chosen, then a value is considered generated if: the value contains a dot '.' and some number -other than 0 (zero) after the dot. - -*Effective Peaks:* Neighborhood window size to consider when calculating density. Smaller values increase -performance but are less reliable. - -*Peak Width, in scans:* Scan window width of scans to consider 'close'. One can see this as the -'tolerated variation in scans' for the apex positions of the fragment peaks composing a cluster. -Note: if MetAlign was used, this is the variation *after* pre-processing by MetAlign. - -*Peak Width confidence:* The higher the confidence, the stricter the threshold. - -*Correlation threshold (0.0 - 1.0):* Tolerance center for Pearson distance calculation. The higher this value, -the higher the correlation between 2 items has to be for them to be considered 'close'. - -*Correlation threshold confidence:* The higher the confidence, the stricter the threshold. `More...`__ - -*Potential Density reduction (0.0 - 1.0):* Reduction tolerance center for Pearson distance calculation. -The higher this value, the less the low-correlated items get reduced, getting a chance to form a cluster of their own. - -*Potential Density reduction softness:* Reduction curve slope for Pearson distance tolerance. Lower -values = stricter separation at the value determined in 'Potential Density reduction' above -(TODO review this comment). - -*Stop Criterion:* When to stop reducing and looking for new clusters. Lower values = more iterations - -.. __: javascript:window.open('$PATH_TO_IMAGES/confidence_and_slope_params_explain.png','popUpWindow','height=700,width=800,left=10,top=10,resizable=yes,scrollbars=yes,toolbar=yes,menubar=no,location=no,directories=no,status=yes') - - ------ - -**Output files described below** - ------ - -*SPECTRA:* this file can be submitted to NIST for identification of the spectra. - -`Click here for more details on the Sample selection and Spectrum peak intensity correction algorithm parameters related to SPECTRA generation`_ - -.. _Click here for more details on the Sample selection and Spectrum peak intensity correction algorithm parameters related to SPECTRA generation: javascript:window.open('$PATH_TO_IMAGES/sample_sel_and_peak_height_correction.png','popUpWindow','height=700,width=800,left=10,top=10,resizable=yes,scrollbars=yes,toolbar=yes,menubar=no,location=no,directories=no,status=yes') - ------ - -*MIC:* stands for Measured Ions Count -> it contains, for each cluster, the sum of the ion count -values (corrected by their membership) for all MEASURED cluster ions in the given sample. 
- -The MIC for a **cluster i** in **sample s**, where **cluster i** has **n** members is thus: - -sum ( [intensity of member n in **sample s**] x [membership value of member n in **cluster i** ] ) - ------ - -*SIM:* stands for Selective Ion Mode -> it contains, for each cluster, the intensity values of the -most representative member ion peak of this cluster. The most representative member peak is the one with the -highest membership*average_intensity. This definition leads to conflicts as a peak can have a -membership in two or more clusters. The assignment of a SIM peak to a cluster depends on -the configured data type (LC or GC-MS). NB: this can be overruled in the "advanced settings": - -(1) LC-MS SIM: select SIM peak only once and for the centrotype in which this specific mass has its -highest membership; for neighboring centrotypes use its "second best SIM", etcetera. In other words, -if the SIM peak has been identified as the SIM in more than 1 cluster, assign as SIM to the cluster -with highest membership. Continue searching for other SIM peaks to assign to the other clusters until -all ambiguities are solved. - -(2) GC-MS SIM: the SIM peak can be "shared" by multiple clusters. However, the intensity values are corrected -by the membership value of the peak in the cluster in case the SIM peak is "shared". If the SIM peak is not -"shared" then the "raw" intensity values of the SIM peak are recorded in the SIM file. - -`Click here for more details on the SIM output file`_ - -.. _Click here for more details on the SIM output file: javascript:window.open('$PATH_TO_IMAGES/sample_SIM.png','popUpWindow','height=700,width=800,left=10,top=10,resizable=yes,scrollbars=yes,toolbar=yes,menubar=no,location=no,directories=no,status=yes') - - - - </help> -</tool>
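To make the MIC definition above concrete: a minimal sketch of the per-sample MIC sum in Python (the member intensities and membership values below are hypothetical, not actual MsClust output):

    # MIC(cluster i, sample s) = sum of intensity * membership over all measured members.
    # Hypothetical (intensity in sample s, membership in cluster i) pairs:
    members = [(12000.0, 0.95), (8500.0, 0.80), (300.0, 0.10)]
    mic = sum(intensity * membership for intensity, membership in members)
    print mic  # 11400.0 + 6800.0 + 30.0 = 18230.0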
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/primsfilters.py Fri Oct 24 12:52:56 2014 +0200 @@ -0,0 +1,44 @@ +import logging +log = logging.getLogger( __name__ ) + + +def restrict_prims_metabolomics( context, tool ): + """ + This tool filter will hide prims_metabolomics tools for non-metabolomics users. + This can be enabled by adding the following to the + ``app:main`` section of ``universe_wsgi.ini``:: + + tool_filters = primsfilters:restrict_prims_metabolomics + + and by adding this file to the folder: + + <galaxy-dist>/lib/galaxy/tools/filters + + This is optional and can be used in case some control is desired on who + gets to see the prims_metabolomics tools. When not using this file and the + settings mentioned above, all prims_metabolomics tools will be visible to + all users. + """ + # for debugging: import pydevd;pydevd.settrace("L0136815.wurnet.nl") + user = context.trans.user + metabolomics_tools = [ "msclust2", "combine_output", "create_poly_model", "lookup_library", + "NDIStext2tabular", "rankfilterGCMS_tabular", "filter_on_rank", + "export_to_metexp_tabular", "query_metexp" ] + found_match = False + # iterate over the tool (partial) ids and look for a match (this is compatible with tool shed given ids): + for partial_id in metabolomics_tools: + if tool.id.find("/"+ partial_id + "/") >= 0: + found_match = True + break + # the second part of this if is compatible with the ids when NOT using tool shed: + if found_match or tool.id in metabolomics_tools: + # logging.warn( 'FILTER MATCHED: %s' %(tool.name)) + + for user_role in user.roles: + if user_role.role.name == "PRIMS_METABOLOMICS": + return True + # not found to have the role, return false: + return False + else: + # return true for any other tool + return True \ No newline at end of file
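A minimal sketch of how this filter behaves, using hypothetical stub classes in place of Galaxy's real context/tool objects (only the attributes the filter actually reads are stubbed; the tool id mimics a tool-shed-style id):

    import primsfilters

    # Hypothetical stubs exposing just context.trans.user.roles[i].role.name and tool.id:
    class Role(object):
        def __init__(self, name): self.name = name
    class UserRole(object):
        def __init__(self, name): self.role = Role(name)
    class User(object):
        def __init__(self, names): self.roles = [UserRole(n) for n in names]
    class Trans(object):
        def __init__(self, user): self.user = user
    class Context(object):
        def __init__(self, user): self.trans = Trans(user)
    class Tool(object):
        def __init__(self, tool_id): self.id = tool_id

    # a metabolomics tool is only shown to users holding the PRIMS_METABOLOMICS role;
    # any other tool id would return True for both users:
    tool = Tool("toolshed.g2.bx.psu.edu/repos/pieterlukasse/prims_metabolomics/msclust2/2.0.1")
    print primsfilters.restrict_prims_metabolomics(Context(User(["PRIMS_METABOLOMICS"])), tool)  # True
    print primsfilters.restrict_prims_metabolomics(Context(User(["SOME_OTHER_ROLE"])), tool)     # False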
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query_mass_repos.py Fri Oct 24 12:52:56 2014 +0200 @@ -0,0 +1,289 @@ +#!/usr/bin/env python +# encoding: utf-8 +''' +Module to query a set of accurate mass values detected by high-resolution mass spectrometers +against various repositories/services such as the METabolomics EXPlorer database or the +MFSearcher service (http://webs2.kazusa.or.jp/mfsearcher/). + +It will take the input file and for each record it will query the +molecular mass in the selected repository/service. If one or more compounds are found +then extra information regarding these compounds is added to the output file. + +The output file is thus the input file enriched with information about +related items found in the selected repository/service. + +The service should implement the following interface: + +http://service_url/mass?targetMs=500&margin=1&marginUnit=ppm&output=txth (txth means there is guaranteed to be a header line before the data) + +The output should be tab separated and should contain the following columns (in this order): +db-name molecular-formula dbe formula-weight id description + + +''' +import csv +import sys +import fileinput +import urllib2 +import time +from collections import OrderedDict + +__author__ = "Pieter Lukasse" +__contact__ = "pieter.lukasse@wur.nl" +__copyright__ = "Copyright, 2014, Plant Research International, WUR" +__license__ = "Apache v2" + +def _process_file(in_xsv, delim='\t'): + ''' + Generic method to parse a tab-separated file returning a dictionary with named columns + @param in_xsv: input filename to be parsed + ''' + data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim)) + return _process_data(data) + +def _process_data(data): + + header = data.pop(0) + # Create dictionary with column name as key + output = OrderedDict() + for index in xrange(len(header)): + output[header[index]] = [row[index] for row in data] + return output + + +def _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit): + + ''' + This method will iterate over the records in the input_data and + will enrich them with the related information found (if any) in the + chosen repository/service + + # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies + ''' + merged = [] + + for i in xrange(len(input_data[input_data.keys()[0]])): + # Get the record in same dictionary format as input_data, but containing + # a value at each column instead of a list of all values of all records: + input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()])) + + # read the molecular mass : + molecular_mass = input_data_record[molecular_mass_col] + + + # search for related records in repository/service: + data_found = None + if molecular_mass != "": + molecular_mass = float(molecular_mass) + + # 1- search for data around this MM: + query_link = repository_dblink + "/mass?targetMs=" + str(molecular_mass) + "&margin=" + str(error_margin) + "&marginUnit=" + margin_unit + "&output=txth" + + data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv") + data_type_found = "MM" + + + if data_found == None: + # If still nothing found, just add empty columns + extra_cols = ['', '','','','',''] + else: + # Add info found: + extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link) + + # Take all data and merge it into a "flat"/simple array of values: + 
field_values_list = _merge_data(input_data_record, extra_cols) + + merged.append(field_values_list) + + # return the merged/enriched records: + return merged + + +def _get_extra_info_and_link_cols(data_found, data_type_found, query_link): + ''' + This method will go over the data found and will return a + list with the following items: + - details of hits found : + db-name molecular-formula dbe formula-weight id description + - Link that executes same query + + ''' + + # set() makes a unique list: + db_name_set = [] + molecular_formula_set = [] + id_set = [] + description_set = [] + + + if 'db-name' in data_found: + db_name_set = set(data_found['db-name']) + elif '# db-name' in data_found: + db_name_set = set(data_found['# db-name']) + if 'molecular-formula' in data_found: + molecular_formula_set = set(data_found['molecular-formula']) + if 'id' in data_found: + id_set = set(data_found['id']) + if 'description' in data_found: + description_set = set(data_found['description']) + + result = [data_type_found, + _to_xsv(db_name_set), + _to_xsv(molecular_formula_set), + _to_xsv(id_set), + _to_xsv(description_set), + #To let Excel interpret as link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"): + "=HYPERLINK(\""+ query_link + "\", \"Link to entries found in DB \")"] + return result + + +def _to_xsv(data_set): + result = "" + for item in data_set: + result = result + str(item) + "|" + return result + + +def _fire_query_and_return_dict(url): + ''' + This method will fire the query as a web-service call and + return the results as a list of dictionary objects + ''' + + try: + data = urllib2.urlopen(url).read() + + # transform to dictionary: + result = [] + data_rows = data.split("\n") + + # remove comment lines if any (only leave the one that has "molecular-formula" word in it...compatible with kazusa service): + data_rows_to_remove = [] + for data_row in data_rows: + if data_row == "" or (data_row[0] == '#' and "molecular-formula" not in data_row): + data_rows_to_remove.append(data_row) + + for data_row in data_rows_to_remove: + data_rows.remove(data_row) + + # check if there is any data in the response: + if len(data_rows) <= 1 or data_rows[1].strip() == '': + # means there is only the header row...so no hits: + return None + + for data_row in data_rows: + if not data_row.strip() == '': + row_as_list = _str_to_list(data_row, delimiter='\t') + result.append(row_as_list) + + # return result processed into a dict: + return _process_data(result) + + except urllib2.HTTPError, e: + raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason) + except urllib2.URLError, e: + raise Exception( "Network error: %s" % e.reason.args[1] + ". Administrator: please check if service [" + url + "] is accessible from your Galaxy server. ") + +def _str_to_list(data_row, delimiter='\t'): + result = [] + for column in data_row.split(delimiter): + result.append(column) + return result + + +# alternative: ? +# s = requests.Session() +# s.verify = False +# #s.auth = (token01, token02) +# resp = s.get(url, params={'name': 'anonymous'}, stream=True) +# content = resp.content +# # transform to dictionary: + + + + +def _merge_data(input_data_record, extra_cols): + ''' + Adds the extra information to the existing data record and returns + the combined new record. 
+ ''' + record = [] + for column in input_data_record: + record.append(input_data_record[column]) + + + # add extra columns + for column in extra_cols: + record.append(column) + + return record + + +def _save_data(data_rows, headers, out_csv): + ''' + Writes tab-separated data to file + @param data_rows: list containing the merged/enriched dataset rows + @param out_csv: output csv file + ''' + + # Open output file for writing + outfile_single_handle = open(out_csv, 'wb') + output_single_handle = csv.writer(outfile_single_handle, delimiter="\t") + + # Write headers + output_single_handle.writerow(headers) + + # Write one line for each row + for data_row in data_rows: + output_single_handle.writerow(data_row) + +def _get_repository_URL(repository_file): + ''' + Read out and return the URL stored in the given file. + ''' + file_input = fileinput.input(repository_file) + try: + for line in file_input: + if line[0] != '#': + # just return the first line that is not a comment line: + return line + finally: + file_input.close() + + +def main(): + ''' + Query main function + + The input file can be any tabular file, as long as it contains a column for the molecular mass. + This column is then used to query against the chosen repository/service database. + ''' + seconds_start = int(round(time.time())) + + input_file = sys.argv[1] + molecular_mass_col = sys.argv[2] + repository_file = sys.argv[3] + error_margin = float(sys.argv[4]) + margin_unit = sys.argv[5] + output_result = sys.argv[6] + + # Parse repository_file to find the URL to the service: + repository_dblink = _get_repository_URL(repository_file) + + # Parse tabular input file into dictionary/array: + input_data = _process_file(input_file) + + # Query data against repository : + enriched_data = _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit) + headers = input_data.keys() + ['SEARCH hits for ','SEARCH hits: db-names', 'SEARCH hits: molecular-formulas ', + 'SEARCH hits: ids','SEARCH hits: descriptions', 'Link to SEARCH hits'] #TODO - add min and max formula weight columns + + _save_data(enriched_data, headers, output_result) + + seconds_end = int(round(time.time())) + print "Took " + str(seconds_end - seconds_start) + " seconds" + + + +if __name__ == '__main__': + main()
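The positional arguments read in main() above can be exercised directly; a minimal sketch mirroring the style of the unit tests added in this changeset (all file names below are hypothetical, and the import path may differ: the unit tests use "from MS import query_mass_repos"):

    import sys
    import query_mass_repos

    sys.argv = ['query_mass_repos',
                'masses.txt',               # tabular input containing a molecular mass column
                'mass (Da)',                # name of that column
                'ExactMassDB_service.txt',  # file whose first non-comment line is the service URL
                '0.001',                    # error margin
                'ms',                       # margin unit ('ms' or 'ppm')
                'masses_enriched.txt']      # output: the input enriched with the hits found
    query_mass_repos.main()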
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query_mass_repos.xml Fri Oct 24 12:52:56 2014 +0200 @@ -0,0 +1,106 @@ +<tool id="query_mass_repos" + name="METEXP - Find elemental composition formulas based on mass values " + version="0.1.0"> + <description>Query multiple public repositories for elemental compositions from accurate mass values detected by high-resolution mass spectrometers</description> + <command interpreter="python"> + query_mass_repos.py + $input_file + "$molecular_mass_col" + "$repository_file" + $error_margin + $margin_unit + $output_result + </command> + <inputs> + + <param name="input_file" format="tabular" type="data" + label="Input file" + help="Select a tabular file containing the entries to be queried against the selected repository/service"/> + + <param name="molecular_mass_col" type="text" size="50" + label="Molecular mass column name" + value="MM" + help="Name of the column containing the molecular mass information (in the given input file)" /> + + <param name="repository_file" type="select" label="Repository/service to query" + help="Select the repository/service which should be queried" + dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/MetExp_MassSearch_Services")'/> + + <param name="error_margin" type="float" size="10" + label="Error margin" + value="0.01" + help="Mass difference allowed when searching in the repositories for a mass match." /> + + <param name="margin_unit" type="select" label="Margin unit"> + <option value="ms" selected="True">ms</option> + <option value="ppm">ppm</option> + </param> + <!-- TODO + <param name="metexp_access_key" type="text" size="50" + label="(Optional) MetExp access key" + value="" + help="Key needed to get access to MetExp services. Fill in if MetExp service was selected" /> --> + + </inputs> + <outputs> + <data name="output_result" format="tabular" label="${tool.name} on ${on_string}" /> + </outputs> + <code file="match_library.py" /> <!-- file containing get_directory_files function used above--> + <help> +.. class:: infomark + +This tool will query multiple public repositories such as PRI-MetExp or http://webs2.kazusa.or.jp/mfsearcher +for elemental compositions from accurate mass values detected by high-resolution mass spectrometers. + +It will take the input file and for each record it will query the +molecular mass in the selected repository. If one or more compounds are found in the +repository then extra information regarding (mass based) matching elemental composition formulas is added to the output file. + +The output file is thus the input file enriched with information about +related items found in the selected repository. + +**Notes** + +The input file can be any tabular file, as long as it contains a column for the molecular mass. + +**Services that can be queried** + +================= ========================================================================= +Database Description +----------------- ------------------------------------------------------------------------- +PRI-MetExp LC-MS and GC-MS data from experiments from the metabolomics group at + Plant Research International. NB: restricted access to employees with + access key. +ExactMassDB A database of possible elemental compositions consisting of C: 100, + H: 200, O: 50, N: 10, P: 10, and S: 10, that satisfy the Senior and + the Lewis valence rules. 
+ (via /mfsearcher/exmassdb/) +ExactMassDB-HR2 HR2, which is one of the fastest tools for calculation of elemental + compositions, filters some elemental compositions according to + the Seven Golden Rules (Kind and Fiehn, 2007). The ExactMassDB-HR2 + database returns the same result as does HR2 with the same atom kind + and number condition as that used in construction of the ExactMassDB. + (via /mfsearcher/exmassdb-hr2/) +Pep1000 A database of possible linear polypeptides that are + constructed with 20 kinds of amino acids and having molecular + weights smaller than 1000. + (via /mfsearcher/pep1000/) +KEGG Re-calculated compound data from KEGG. Weekly updated. + (via /mfsearcher/kegg/) +KNApSAcK Re-calculated compound data from KNApSAcK. + (via /mfsearcher/knapsack/) +Flavonoid Viewer Re-calculated compound data from Flavonoid Viewer. + (via /mfsearcher/flavonoidviewer/) +LipidMAPS Re-calculated compound data from LIPID MAPS. + (via /mfsearcher/lipidmaps/) +HMDB Re-calculated compound data from Human Metabolome Database (HMDB) + Version 3.5. + (via /mfsearcher/hmdb/) +PubChem Re-calculated compound data from PubChem. Monthly updated. + (via /mfsearcher/pubchem/) +================= ========================================================================= + +Sources for table above: PRI-MetExp and http://webs2.kazusa.or.jp/mfsearcher + + </help> +</tool>
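For reference, the txth response such a service is expected to return is a tab-separated table with one header line, as described in query_mass_repos.py above. A hypothetical example (the id and description values are illustrative only, not real query output):

    # db-name	molecular-formula	dbe	formula-weight	id	description
    ExactMassDB	C6H12O6	1	180.06339	EMDB-00042	(illustrative hit)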
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query_metexp.py Fri Oct 24 12:52:56 2014 +0200 @@ -0,0 +1,282 @@ +#!/usr/bin/env python +# encoding: utf-8 +''' +Module to query a set of identifications against the METabolomics EXPlorer database. + +It will take the input file and for each record it will query the +molecular mass in the selected MetExp DB. If one or more compounds are found in the +MetExp DB then extra information regarding these compounds is added to the output file. + +The output file is thus the input file enriched with information about +related items found in the selected MetExp DB. +''' +import csv +import sys +import fileinput +import urllib2 +import time +from collections import OrderedDict + +__author__ = "Pieter Lukasse" +__contact__ = "pieter.lukasse@wur.nl" +__copyright__ = "Copyright, 2014, Plant Research International, WUR" +__license__ = "Apache v2" + +def _process_file(in_xsv, delim='\t'): + ''' + Generic method to parse a tab-separated file returning a dictionary with named columns + @param in_xsv: input filename to be parsed + ''' + data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim)) + return _process_data(data) + +def _process_data(data): + + header = data.pop(0) + # Create dictionary with column name as key + output = OrderedDict() + for index in xrange(len(header)): + output[header[index]] = [row[index] for row in data] + return output + + +def _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method): + ''' + This method will iterate over the records in the input_data and + will enrich them with the related information found (if any) in the + MetExp Database. + + # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies + ''' + merged = [] + + for i in xrange(len(input_data[input_data.keys()[0]])): + # Get the record in same dictionary format as input_data, but containing + # a value at each column instead of a list of all values of all records: + input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()])) + + # read the CAS id, formula and molecular mass: + cas_id = input_data_record[casid_col] + formula = input_data_record[formula_col] + molecular_mass = input_data_record[molecular_mass_col] + + # search for related records in MetExp: + data_found = None + if cas_id != "undef": + # 1- search for other experiments where this CAS id has been found: + query_link = metexp_dblink + "/find_entries/query?cas_nr="+ cas_id + "&method=" + separation_method + data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv") + data_type_found = "CAS" + if data_found == None: + # 2- search for other experiments where this FORMULA has been found: + query_link = metexp_dblink + "/find_entries/query?molecule_formula="+ formula + "&method=" + separation_method + data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv") + data_type_found = "FORMULA" + if data_found == None: + # 3- search for other experiments where this MM has been found: + query_link = metexp_dblink + "/find_entries/query?molecule_mass="+ molecular_mass + "&method=" + separation_method + data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv") + data_type_found = "MM" + + if data_found == None: + # If still nothing found, just add empty columns + extra_cols = ['', '','','','','','',''] + else: + # Add info found: + extra_cols = 
_get_extra_info_and_link_cols(data_found, data_type_found, query_link) + + # Take all data and merge it into a "flat"/simple array of values: + field_values_list = _merge_data(input_data_record, extra_cols) + + merged.append(field_values_list) + + # return the merged/enriched records: + return merged + + +def _get_extra_info_and_link_cols(data_found, data_type_found, query_link): + ''' + This method will go over the data found and will return a + list with the following items: + - Experiment details where hits have been found : + 'organism', 'tissue','experiment_name','user_name','column_type' + - Link that executes same query + + ''' + # set() makes a unique list: + organism_set = [] + tissue_set = [] + experiment_name_set = [] + user_name_set = [] + column_type_set = [] + cas_nr_set = [] + + if 'organism' in data_found: + organism_set = set(data_found['organism']) + if 'tissue' in data_found: + tissue_set = set(data_found['tissue']) + if 'experiment_name' in data_found: + experiment_name_set = set(data_found['experiment_name']) + if 'user_name' in data_found: + user_name_set = set(data_found['user_name']) + if 'column_type' in data_found: + column_type_set = set(data_found['column_type']) + if 'CAS' in data_found: + cas_nr_set = set(data_found['CAS']) + + + result = [data_type_found, + _to_xsv(organism_set), + _to_xsv(tissue_set), + _to_xsv(experiment_name_set), + _to_xsv(user_name_set), + _to_xsv(column_type_set), + _to_xsv(cas_nr_set), + #To let Excel interpret as link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"): + "=HYPERLINK(\""+ query_link + "\", \"Link to entries found in DB \")"] + return result + + +def _to_xsv(data_set): + result = "" + for item in data_set: + result = result + str(item) + "|" + return result + + +def _fire_query_and_return_dict(url): + ''' + This method will fire the query as a web-service call and + return the results as a list of dictionary objects + ''' + + try: + data = urllib2.urlopen(url).read() + + # transform to dictionary: + result = [] + data_rows = data.split("\n") + + # check if there is any data in the response: + if len(data_rows) <= 1 or data_rows[1].strip() == '': + # means there is only the header row...so no hits: + return None + + for data_row in data_rows: + if not data_row.strip() == '': + row_as_list = _str_to_list(data_row, delimiter='\t') + result.append(row_as_list) + + # return result processed into a dict: + return _process_data(result) + + except urllib2.HTTPError, e: + raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason) + except urllib2.URLError, e: + raise Exception( "Network error: %s" % e.reason.args[1] + ". Administrator: please check if MetExp service [" + url + "] is accessible from your Galaxy server. ") + +def _str_to_list(data_row, delimiter='\t'): + result = [] + for column in data_row.split(delimiter): + result.append(column) + return result + + +# alternative: ? +# s = requests.Session() +# s.verify = False +# #s.auth = (token01, token02) +# resp = s.get(url, params={'name': 'anonymous'}, stream=True) +# content = resp.content +# # transform to dictionary: + + + + +def _merge_data(input_data_record, extra_cols): + ''' + Adds the extra information to the existing data record and returns + the combined new record. 
+ ''' + record = [] + for column in input_data_record: + record.append(input_data_record[column]) + + + # add extra columns + for column in extra_cols: + record.append(column) + + return record + + +def _save_data(data_rows, headers, out_csv): + ''' + Writes tab-separated data to file + @param data_rows: list containing the merged/enriched dataset rows + @param out_csv: output csv file + ''' + + # Open output file for writing + outfile_single_handle = open(out_csv, 'wb') + output_single_handle = csv.writer(outfile_single_handle, delimiter="\t") + + # Write headers + output_single_handle.writerow(headers) + + # Write one line for each row + for data_row in data_rows: + output_single_handle.writerow(data_row) + +def _get_metexp_URL(metexp_dblink_file): + ''' + Read out and return the URL stored in the given file. + ''' + file_input = fileinput.input(metexp_dblink_file) + try: + for line in file_input: + if line[0] != '#': + # just return the first line that is not a comment line: + return line + finally: + file_input.close() + + +def main(): + ''' + MetExp Query main function + + The input file can be any tabular file, as long as it contains a column for the CAS number, + one for the molecular mass and one for the formula of the respective identification. These + columns are then used to query against the MetExp Database. + ''' + seconds_start = int(round(time.time())) + + input_file = sys.argv[1] + casid_col = sys.argv[2] + formula_col = sys.argv[3] + molecular_mass_col = sys.argv[4] + metexp_dblink_file = sys.argv[5] + separation_method = sys.argv[6] + output_result = sys.argv[7] + + # Parse metexp_dblink_file to find the URL to the MetExp service: + metexp_dblink = _get_metexp_URL(metexp_dblink_file) + + # Parse tabular input file into dictionary/array: + input_data = _process_file(input_file) + + # Query data against MetExp DB : + enriched_data = _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method) + headers = input_data.keys() + ['METEXP hits for ','METEXP hits: organisms', 'METEXP hits: tissues', + 'METEXP hits: experiments','METEXP hits: user names','METEXP hits: column types', 'METEXP hits: CAS nrs', 'Link to METEXP hits'] + + _save_data(enriched_data, headers, output_result) + + seconds_end = int(round(time.time())) + print "Took " + str(seconds_end - seconds_start) + " seconds" + + + +if __name__ == '__main__': + main()
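The three-step fallback in _query_and_add_data above (first CAS number, then formula, then molecular mass) results in query URLs of the following form; a sketch with a hypothetical base URL and glycerol as the example compound:

    metexp_dblink = "http://metexp.example.org/rest"  # hypothetical base URL
    separation_method = "GC"
    # 1- by CAS number:
    q1 = metexp_dblink + "/find_entries/query?cas_nr=56-81-5&method=" + separation_method
    # 2- fallback, by formula:
    q2 = metexp_dblink + "/find_entries/query?molecule_formula=C3H8O3&method=" + separation_method
    # 3- last resort, by molecular mass:
    q3 = metexp_dblink + "/find_entries/query?molecule_mass=92.09&method=" + separation_method
    # each URL is fired with "&_format_result=tsv" appended; the first query returning
    # data rows determines the reported hit type (CAS, FORMULA or MM)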
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query_metexp.xml Fri Oct 24 12:52:56 2014 +0200 @@ -0,0 +1,69 @@ +<tool id="query_metexp" + name="METEXP - Query Database " + version="0.1.0"> + <description>Query a set of identifications against the METabolomics EXPlorer database</description> + <command interpreter="python"> + query_metexp.py + $input_file + "$casid_col" + "$formula_col" + "$molecular_mass_col" + "$metexp_dblink_file" + $separation_method + $output_result + </command> + <inputs> + + <param name="input_file" format="tabular" type="data" + label="Input file" + help="Select a tabular file containing the entries to be queried/verified in the MetExp DB"/> + + <param name="separation_method" type="select" label="Data type to query"> + <option value="GC" selected="True">GC</option> + <option value="LC">LC</option> + </param> + + + <param name="casid_col" type="text" size="50" + label="CAS ID column name" + value="CAS" + help="Name of the column containing the CAS code information (in the given input file)" /> + <param name="formula_col" type="text" size="50" + label="Formula column name" + value="FORMULA" + help="Name of the column containing the formula information (in the given input file)" /> + <param name="molecular_mass_col" type="text" size="50" + label="Molecular mass column name" + value="MM" + help="Name of the column containing the molecular mass information (in the given input file)" /> + + <param name="metexp_dblink_file" type="select" label="MetExp DB to query" + help="Select the MetExp Database/backend which should be queried" + dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/MetExp_Databases")'/> + + + </inputs> + <outputs> + <data name="output_result" format="tabular" label="${tool.name} on ${on_string}" /> + </outputs> + <code file="match_library.py" /> <!-- file containing get_directory_files function used above--> + <help> +.. class:: infomark + +This tool will query a set of identifications against the METabolomics EXPlorer database. + +It will take the input file and for each record it will query the +molecular mass in the selected MetExp DB. If one or more compounds are found in the +MetExp DB then extra information regarding these compounds is added to the output file. + +The output file is thus the input file enriched with information about +related items found in the selected MetExp DB. + +**Notes** + +The input file can be any tabular file, as long as it contains a column for the CAS number, +one for the molecular mass and one for the formula of the respective identification. + + + </help> +</tool>
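A hypothetical minimal input file for this tool (tab separated; the three queried columns are matched by name, and all input columns are carried over into the enriched output):

    Name	CAS	FORMULA	MM
    glycerol (example row)	56-81-5	C3H8O3	92.09
    unknown (example row)	undef	C6H12O6	180.16

Note that a CAS value of 'undef' skips the CAS lookup and falls through to the formula and molecular mass queries.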
--- a/rankfilterGCMS_tabular.xml Thu Jan 16 13:22:38 2014 +0100 +++ b/rankfilterGCMS_tabular.xml Fri Oct 24 12:52:56 2014 +0200 @@ -3,11 +3,14 @@ <command interpreter="python">rankfilter_GCMS/rankfilter.py $input_file</command> <inputs> <param format="tabular" name="sample" type="data" label="Sample File" - help="Converted PDF file in tabular format" /> + help="Select a tab delimited NIST metabolite identifications file (converted from PDF)" /> <!-- question: is this calibration file not column specific as it includes RT info?? --> - <param name="calibration" type="select" label="Calibration File" + <!-- this one should be input file for now:<param name="calibration" type="select" label="Calibration File" help="Calibration file with reference masses (e.g. alkanes) with their RT and RI values" dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/RankFilter_Calibration_Files")'/> + --> + <param name="calibration" format="any" type="data" label="Calibration File" + help="Calibration file containing reference masses (e.g. alkanes) with their respective RT and RI values"/> <param name="analysis_type" type="select" format="text" label="Analysis Type" help="Select the type of analysis that has been used to generate the sample file">
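The exact column layout rankfilter.py expects for the calibration file is not shown in this changeset; purely to illustrate the idea (n-alkane reference masses with their retention times and retention indices), a hypothetical calibration file might look like:

    MW	R.T.	RI
    128.26	5.12	900
    142.28	6.03	1000
    156.31	6.91	1100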
--- a/rankfilter_GCMS/pdfread.py Thu Jan 16 13:22:38 2014 +0100 +++ b/rankfilter_GCMS/pdfread.py Fri Oct 24 12:52:56 2014 +0200 @@ -52,8 +52,9 @@ for line in hit_list: line = line.strip().translate(None, '\r') if line != '': - hits = line.replace('\n', ' ').replace('\x0c', '').replace('^L', '').split('Hit') - + hits = line.replace('\n', ' ').replace('\x0c', '').replace('^L', '').split('Hit') # possible solution: if we replaced the \n by some special sign instead of ' ', then reading the formula would be simpler! + # strange... code seems fine actually... debug! See test/data/download.pdf + # strange thing is that it looks like the new line does not end up in the text file, even though it looks like there is a new line in the pdf... perhaps a bug in the pdf2text command in Linux? spec_id = hits.pop(0).split(' ')[1] j = 0 for hh in hits: @@ -69,8 +70,13 @@ name_tmp = ':'.join(cell[0].split(':')[1:]) else: name_tmp = cell[0].split(':')[1] + + # ugly workaround for the cases where there ends up being no space between the name and the formula: exhaustive + # replacement of known cases by the same with a white space: name_tmp = name_tmp.replace('lC', 'l C').replace(']C', '] C').replace('sC', 's C').replace('9C', '9 C').replace('.C', '. C') name_tmp = name_tmp.replace(')C', ') C').replace('eC', 'e C').replace('yC', 'y C').replace('oC', 'o C').replace('-C', '- C').replace('dC', 'd C').replace('rC', 'r C') + name_tmp = name_tmp.replace('-, LC', '-, L C').replace('-, DC', '-, D C') + name.append((' '.join(name_tmp.split(' ')[0:len(name_tmp) - 1])).replace(" ", " ")) if name_tmp: if name_tmp.split(' ')[-1][0] == 'C' or name_tmp.split(' ')[-1][0] == 'F' or name_tmp.split(' ')[-1][0] == 'H':
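The effect of the added replaces can be illustrated with the dummy hit from the unit test below: the PDF text contains '-, LC21H52O6Si5' with no space before the formula, and the new replace('-, LC', '-, L C') restores the separator so that the last space-delimited token parses as the formula:

    name_tmp = 'Sorbopyranose, 1,2,3,4,5-pentakis-O-(trimethylsilyl)-, LC21H52O6Si5'
    name_tmp = name_tmp.replace('-, LC', '-, L C')
    print name_tmp.split(' ')[-1]  # 'C21H52O6Si5' -> starts with 'C', so recognized as a formula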
--- a/rankfilter_GCMS/test/test_pdfread.py Thu Jan 16 13:22:38 2014 +0100 +++ b/rankfilter_GCMS/test/test_pdfread.py Fri Oct 24 12:52:56 2014 +0200 @@ -17,13 +17,20 @@ ''' Tests the reading and parsing of a NIST PDF file ''' - [hitlist, hitlist_missed] = pdfread.getPDF(self.nist_pdf) + [hitlist, hitlist_missed] = pdfread.getPDF(self.nist_pdf, True) rows = [hitlist[row] for row in hitlist.keys()] data = [set(row) for row in zip(*rows)] expected_element = set(('12.3', ' Sucrose ', '14', 'undef', ' standards 2009', ' 660', 'not_def', '18495-0.142537-21284-2.26544e+07-135', '22.6544', ' 714')) self.failUnless(expected_element in data) self.failUnless(len(hitlist_missed) != 0) + ''' + Check for last (dummy) hit: + Hit 6 : (dummy hit)Sorbopyranose, 1,2,3,4,5-pentakis-O-(trimethylsilyl)-, LC21H52O6Si5;MF: 658; RMF: 658; Prob 15.6%; CAS: 30645-02-4; Lib: mainlib; ID: 37062. + ''' + expected_element = set(['C21H52O6Si5', ' 30645-02-4', ' mainlib', '15.6', ' (dummy hit)Sorbopyranose, 1,2,3,4,5-pentakis-O-(trimethylsilyl)-, L C21H52O6Si5', '7298-1-9580-1.29014e+07-9', ' 658', '12.9014', '37062']) + self.failUnless(expected_element in data) + if __name__ == "__main__": #import sys;sys.argv = ['', 'Test.test_getPDF']
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/repository_dependencies.xml Fri Oct 24 12:52:56 2014 +0200 @@ -0,0 +1,4 @@ +<?xml version="1.0"?> +<repositories description="Required metaMS dependencies."> + <repository toolshed="http://testtoolshed.g2.bx.psu.edu" name="prims_metabolomics_dependencies" owner="pieterlukasse" changeset_revision="62b3f4d6e015" /> +</repositories>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/static_resources/elements_and_masses.tab Fri Oct 24 12:52:56 2014 +0200 @@ -0,0 +1,104 @@ +Name Atomic number Chemical symbol Relative atomic mass +Hydrogen 1 H 1.01 +Helium 2 He 4 +Lithium 3 Li 6.94 +Beryllium 4 Be 9.01 +Boron 5 B 10.81 +Carbon 6 C 12.01 +Nitrogen 7 N 14.01 +Oxygen 8 O 16 +Fluorine 9 F 19 +Neon 10 Ne 20.18 +Sodium 11 Na 22.99 +Magnesium 12 Mg 24.31 +Aluminum 13 Al 26.98 +Silicon 14 Si 28.09 +Phosphorus 15 P 30.98 +Sulfur 16 S 32.06 +Chlorine 17 Cl 35.45 +Argon 18 Ar 39.95 +Potassium 19 K 39.1 +Calcium 20 Ca 40.08 +Scandium 21 Sc 44.96 +Titanium 22 Ti 47.9 +Vanadium 23 V 50.94 +Chromium 24 Cr 52 +Manganese 25 Mn 54.94 +Iron 26 Fe 55.85 +Cobalt 27 Co 58.93 +Nickel 28 Ni 58.71 +Copper 29 Cu 63.54 +Zinc 30 Zn 65.37 +Gallium 31 Ga 69.72 +Germanium 32 Ge 72.59 +Arsenic 33 As 74.99 +Selenium 34 Se 78.96 +Bromine 35 Br 79.91 +Krypton 36 Kr 83.8 +Rubidium 37 Rb 85.47 +Strontium 38 Sr 87.62 +Yttrium 39 Y 88.91 +Zirconium 40 Zr 91.22 +Niobium 41 Nb 92.91 +Molybdenum 42 Mo 95.94 +Technetium 43 Tc 96.91 +Ruthenium 44 Ru 101.07 +Rhodium 45 Rh 102.9 +Palladium 46 Pd 106.4 +Silver 47 Ag 107.87 +Cadmium 48 Cd 112.4 +Indium 49 In 114.82 +Tin 50 Sn 118.69 +Antimony 51 Sb 121.75 +Tellurium 52 Te 127.6 +Iodine 53 I 126.9 +Xenon 54 Xe 131.3 +Cesium 55 Cs 132.9 +Barium 56 Ba 137.34 +Lanthanum 57 La 138.91 +Cerium 58 Ce 140.12 +Praseodymium 59 Pr 140.91 +Neodymium 60 Nd 144.24 +Promethium 61 Pm 144.91 +Samarium 62 Sm 150.35 +Europium 63 Eu 151.96 +Gadolinium 64 Gd 157.25 +Terbium 65 Tb 158.92 +Dysprosium 66 Dy 162.5 +Holmium 67 Ho 164.93 +Erbium 68 Er 167.26 +Thulium 69 Tm 168.93 +Ytterbium 70 Yb 173.04 +Lutetium 71 Lu 174.97 +Hafnium 72 Hf 178.49 +Tantalum 73 Ta 180.95 +Wolfram 74 W 183.85 +Rhenium 75 Re 186.2 +Osmium 76 Os 190.2 +Iridium 77 Ir 192.22 +Platinum 78 Pt 195.09 +Gold 79 Au 196.97 +Mercury 80 Hg 200.59 +Thallium 81 Tl 204.37 +Lead 82 Pb 207.19 +Bismuth 83 Bi 208.98 +Polonium 84 Po 208.98 +Astatine 85 At 209.99 +Radon 86 Rn 222.02 +Francium 87 Fr 223.02 +Radium 88 Ra 226 +Actinium 89 Ac 227.03 +Thorium 90 Th 232.04 +Protactinium 91 Pa 231.04 +Uranium 92 U 238.03 +Neptunium 93 Np 237 +Plutonium 94 Pu 242 +Americium 95 Am 243.06 +Curium 96 Cm 247.07 +Berkelium 97 Bk 247.07 +Californium 98 Cf 251.08 +Einsteinium 99 Es 254.09 +Fermium 100 Fm 257.1 +Mendelevium 101 Md 257.1 +Nobelium 102 No 255.09 +Lawrencium 103 Lr 256.1
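The MM calculation test added in test_export_to_metexp_tabular.py (further below) shows how this table is meant to be used: a formula is split into element/count pairs and the relative atomic masses are summed. The actual implementation lives in export_to_metexp_tabular.py, which is not part of this changeset; a minimal sketch under that assumption, reading the tab-separated table above:

    import re

    # chemical symbol -> relative atomic mass, from static_resources/elements_and_masses.tab:
    masses = {}
    with open('static_resources/elements_and_masses.tab') as handle:
        handle.readline()  # skip the header line
        for line in handle:
            columns = line.rstrip('\n').split('\t')
            masses[columns[2]] = float(columns[3])

    def molecular_mass(formula):
        '''Sum the atomic masses of a simple formula such as C8H18O3.'''
        total = 0.0
        for symbol, count in re.findall('([A-Z][a-z]?)([0-9]*)', formula):
            if symbol:
                total += masses[symbol] * (int(count) if count else 1)
        return total

    print molecular_mass('C8H18O3')  # 12.01*8 + 1.01*18 + 16*3 = 162.26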
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/Copy of metaMS_cmd_interface.r Fri Oct 24 12:52:56 2014 +0200 @@ -0,0 +1,65 @@ +## =========== This is a copy that can be executed in R environment directly for interactive tests + +## read args: +args <- "test" + +## the constructed DB, e.g. "E:/Rworkspace/metaMS/data/LCDBtest.RData" +args.constructedDB <- "E:/workspace/PRIMS-metabolomics/python-tools/tools/metaMS/test/LCDBtest.RData" +## data files, e.g. "E:/Rworkspace/metaMS/data/data.zip" (with e.g. .CDF files) and unzip output dir, e.g. "E:/" +args.dataZip <- "E:/workspace/PRIMS-metabolomics/python-tools/tools/metaMS/test/extdata.zip" +args.zipExtrDir <- "E:/workspace/PRIMS-metabolomics/python-tools/tools/metaMS/test/zipOut" +## settings file, e.g. "E:/Rworkspace/metaMS/data/settings.r", should contain assignment to an object named "customMetaMSsettings" +args.settings <- "E:/workspace/PRIMS-metabolomics/python-tools/tools/metaMS/test/example_settings.txt" + +## output file names, e.g. "E:/Rworkspace/metaMS/data/out.txt" +args.outAnnotationTable <- "E:/workspace/PRIMS-metabolomics/python-tools/tools/metaMS/test/out/annotationTable.txt" +args.outLogFile <- "E:/workspace/PRIMS-metabolomics/python-tools/tools/metaMS/test/out/logfile.txt" + +library(metaMS) + +## load the constructed DB : +tempEnv <- new.env() +testDB <- load(args.constructedDB, envir=tempEnv) + +## load the data files from a zip file +files <- unzip(args.dataZip, exdir=args.zipExtrDir) + +## load settings "script" into "customMetaMSsettings" +source(args.settings, local=tempEnv) + + +# Just to highlight: if you want to use more than one +# trigger runLC: +LC <- runLC(files, settings = tempEnv[["customMetaMSsettings"]], DB = tempEnv[[testDB[1]]]$DB, nSlaves=20, returnXset = TRUE) + +# write out runLC annotation results: +write.table(LC$Annotation$annotation.table, args.outAnnotationTable, sep="\t", row.names=FALSE) + +# the used constructed DB (write to log): +sink( args.outLogFile ) +cat("Constructed DB info===============:") +str(tempEnv[[testDB[1]]]$Info) +cat("Constructed DB table===============:") +sink() +write.table(tempEnv[[testDB[1]]]$DB, args.outLogFile, append=TRUE, row.names=FALSE) +write.table(tempEnv[[testDB[1]]]$Reftable, args.outLogFile, sep="\t", append=TRUE, row.names=FALSE) + +# report +LC$xset@xcmsSet +gt <- groups(LC$xset@xcmsSet) +colnames(gt) +groupidx1 <- which(gt[,"rtmed"] > 0 & gt[,"rtmed"] < 3000 & gt[,"npeaks"] > 3) +if (length(groupidx1) > 0) +{ + eiccor <- getEIC(LC$xset@xcmsSet, groupidx = c(groupidx1)) + eicraw <- getEIC(LC$xset@xcmsSet, groupidx = c(groupidx1), rt = "raw") + for (i in 1:length(groupidx1)) + { + png( paste("E:/workspace/PRIMS-metabolomics/python-tools/tools/metaMS/test/out/outplot", i,".png", sep="") ) + plot(eiccor, LC$xset@xcmsSet, groupidx = i) + devname = dev.off() + } + +} + +
--- a/test/__init__.py Thu Jan 16 13:22:38 2014 +0100 +++ b/test/__init__.py Fri Oct 24 12:52:56 2014 +0200 @@ -1,1 +1,1 @@ -''' BRS GCMS Galaxy Tools Module ''' +''' unit tests '''
--- a/test/integration_tests.py Thu Jan 16 13:22:38 2014 +0100 +++ b/test/integration_tests.py Fri Oct 24 12:52:56 2014 +0200 @@ -119,7 +119,7 @@ #Build up arguments and run input_txt = resource_filename(__name__, "data/integration/NIST_identification_results_tabular.txt") - library = resource_filename(__name__, "data/integration/Library_RI_DB_capillary_columns-noDuplicates.txt") + library = resource_filename(__name__, "../repositories/PRIMS-metabolomics/RI_DB_libraries/Library_RI_DB_capillary_columns-noDuplicates.txt") regress_model = resource_filename(__name__, "data/integration/regression_MODEL_for_columns.txt") sys.argv = ['test', library,
--- a/test/test_export_to_metexp_tabular.py Thu Jan 16 13:22:38 2014 +0100 +++ b/test/test_export_to_metexp_tabular.py Fri Oct 24 12:52:56 2014 +0200 @@ -10,6 +10,27 @@ class IntegrationTest(unittest.TestCase): + def test_MM_calculations(self): + ''' + test the implemented method for MM calculations for + given chemical formulas + ''' + export_to_metexp_tabular.init_elements_and_masses_map() + + formula = "C8H18O3" + # should be = 12.01*8 + 1.01*18 + 16*3 = 162.26 + result = export_to_metexp_tabular.get_molecular_mass(formula) + self.assertEqual(162.26, result) + + formula = "CH2O3Fe2Ni" + # should be = 12.01*1 + 1.01*2 + 16*3 + 55.85*2 + 58.71 = 232.44 + result = export_to_metexp_tabular.get_molecular_mass(formula) + self.assertAlmostEqual(232.44, result, 2) + + + + + def test_combine_output_simple(self): ''' comment me @@ -28,7 +49,13 @@ sys.argv = ['test', rankfilter_and_caslookup_combined_file, msclust_quantification_and_spectra_file, - output_csv] + output_csv, + 'tomato', + 'leafs', + 'test experiment', + 'pieter', + 'DB5 column'] + # Execute main function with arguments provided through sys.argv export_to_metexp_tabular.main()
--- a/test/test_library_lookup.py Thu Jan 16 13:22:38 2014 +0100 +++ b/test/test_library_lookup.py Fri Oct 24 12:52:56 2014 +0200 @@ -27,7 +27,7 @@ Tests the 'create_lookup_table' function ''' column_type = 'Capillary' - polarity = 'Semi-standard non-polar' + polarity = 'Semi-standard non-polar6' lookup_dict = library_lookup.create_lookup_table(self.ri_database, column_type, polarity) self.assertFalse(False in [res[4] == 'Capillary' for res in lookup_dict['4177166']]) self.assertEqual(['C51276336', '2,6-Dimethyl-octa-1,7-dien-3,6-diol', 'C10H18O2', @@ -147,9 +147,9 @@ ''' Tests the match_library.py functionality ''' - riqc_libs_dir = resource_filename(__name__, "../repositories") + riqc_libs_dir = resource_filename(__name__, "../repositories/PRIMS-metabolomics/RI_DB_libraries") get_library_files_output = match_library.get_directory_files(riqc_libs_dir) - self.assertEqual(4, len(get_library_files_output)) + self.assertEqual(2, len(get_library_files_output)) self.assertEqual("Library_RI_DB_capillary_columns-noDuplicates", get_library_files_output[0][0]) #TODO change assert below to assert that the result is a file, so the test can run on other dirs as well: #self.assertEqual("E:\\workspace\\PRIMS-metabolomics\\python-tools\\tools\\GCMS\\test\\data\\riqc_libs\\RI DB library (capillary columns) Dec.2012.txt", get_library_files_output[0][1])
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/test_query_mass_repos.py Fri Oct 24 12:52:56 2014 +0200 @@ -0,0 +1,62 @@ +'''Integration tests for the GCMS project''' + +from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 +from MS import query_mass_repos +import os.path +import sys +import unittest + + +class IntegrationTest(unittest.TestCase): + + + + + def test_simple(self): + ''' + Simple initial test + ''' + # Create out folder + outdir = "output/query_mass_repos/" + if not os.path.exists(outdir): + os.makedirs(outdir) + + #Build up arguments and run + + # input_file = sys.argv[1] + # molecular_mass_col = sys.argv[2] + # repository_file = sys.argv[3] + # mass_tolerance = float(sys.argv[4]) + # output_result = sys.argv[5] + + input_file = resource_filename(__name__, "data/service_query_tabular.txt") + + molecular_mass_col = "mass (Da)" + dblink_file = resource_filename(__name__, "data/MFSearcher ExactMassDB service.txt") + output_result = resource_filename(__name__, outdir + "metexp_query_results_added.txt") + + + + + sys.argv = ['test', + input_file, + molecular_mass_col, + dblink_file, + '0.001', + 'ms', + output_result] + + # Execute main function with arguments provided through sys.argv + query_mass_repos.main() + + + + + +def _read_file(filename): + ''' + Helper method to quickly read a file + @param filename: + ''' + with open(filename) as handle: + return handle.read()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/test_query_metexp.py Fri Oct 24 12:52:56 2014 +0200 @@ -0,0 +1,82 @@ +'''Integration tests for the GCMS project''' + +from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 +from GCMS import query_metexp +import os.path +import sys +import unittest + + +class IntegrationTest(unittest.TestCase): + + +# def test_MM_calculations(self): +# ''' +# test the implemented method for MM calculations for +# given chemical formulas +# ''' +# export_to_metexp_tabular.init_elements_and_masses_map() +# +# formula = "C8H18O3" +# # should be = 12.01*8 + 1.01*18 + 16*3 = 162.26 +# result = export_to_metexp_tabular.get_molecular_mass(formula) +# self.assertEqual(162.26, result) +# +# formula = "CH2O3Fe2Ni" +# # should be = 12.01*1 + 1.01*2 + 16*3 + 55.85*2 + 58.71 = 232.44 +# result = export_to_metexp_tabular.get_molecular_mass(formula) +# self.assertAlmostEqual(232.44, result, 2) +# +# +# + + + def test_simple(self): + ''' + Simple initial test + ''' + # Create out folder + outdir = "output/metexp_query/" + if not os.path.exists(outdir): + os.makedirs(outdir) + + #Build up arguments and run + + # input_file = sys.argv[1] + # molecular_mass_col = sys.argv[2] + # formula_col = sys.argv[3] + # metexp_dblink_file = sys.argv[4] + # output_result = sys.argv[5] + + input_file = resource_filename(__name__, "data/metexp_query_tabular.txt") + casid_col = "CAS" + formula_col = "FORMULA" + molecular_mass_col = "MM" + metexp_dblink_file = resource_filename(__name__, "data/METEXP Test DB.txt") + output_result = resource_filename(__name__, outdir + "metexp_query_results_added.txt") + + sys.argv = ['test', + input_file, + casid_col, + formula_col, + molecular_mass_col, + metexp_dblink_file, + 'GC', + output_result] + + # Execute main function with arguments provided through sys.argv + query_metexp.main() + + # TODO - asserts (base them on DB being filled with test data form metexp unit test for upload method) + # PA + + + + +def _read_file(filename): + ''' + Helper method to quickly read a file + @param filename: + ''' + with open(filename) as handle: + return handle.read()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/test_query_metexp_LARGE.py Fri Oct 24 12:52:56 2014 +0200 @@ -0,0 +1,79 @@ +'''Integration tests for the GCMS project''' + +from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 +from GCMS import query_metexp +import os.path +import sys +import unittest + + +class IntegrationTest(unittest.TestCase): + + +# def test_MM_calculations(self): +# ''' +# test the implemented method for MM calculations for +# given chemical formulas +# ''' +# export_to_metexp_tabular.init_elements_and_masses_map() +# +# formula = "C8H18O3" +# # should be = 12.01*8 + 1.01*18 + 16*3 = 162.26 +# result = export_to_metexp_tabular.get_molecular_mass(formula) +# self.assertEqual(162.26, result) +# +# formula = "CH2O3Fe2Ni" +# # should be = 12.01*1 + 1.01*2 + 16*3 + 55.85*2 + 58.71 = 232.44 +# result = export_to_metexp_tabular.get_molecular_mass(formula) +# self.assertAlmostEqual(232.44, result, 2) +# +# +# + + + def test_large(self): + ''' + Simple test, but on larger set, last test executed in 28s + ''' + # Create out folder + outdir = "output/metexp_query/" + if not os.path.exists(outdir): + os.makedirs(outdir) + + #Build up arguments and run + + # input_file = sys.argv[1] + # molecular_mass_col = sys.argv[2] + # formula_col = sys.argv[3] + # metexp_dblink_file = sys.argv[4] + # output_result = sys.argv[5] + + input_file = resource_filename(__name__, "data/metexp_query_tabular_large.txt") + casid_col = "CAS" + formula_col = "FORMULA" + molecular_mass_col = "MM" + metexp_dblink_file = resource_filename(__name__, "data/METEXP Test DB.txt") + output_result = resource_filename(__name__, outdir + "metexp_query_results_added_LARGE.txt") + + sys.argv = ['test', + input_file, + casid_col, + formula_col, + molecular_mass_col, + metexp_dblink_file, + 'GC', + output_result] + + # Execute main function with arguments provided through sys.argv + query_metexp.main() + + + + +def _read_file(filename): + ''' + Helper method to quickly read a file + @param filename: + ''' + with open(filename) as handle: + return handle.read()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xcms_differential_analysis.r Fri Oct 24 12:52:56 2014 +0200 @@ -0,0 +1,73 @@ +## read args: +args <- commandArgs(TRUE) +#cat("args <- \"\"\n") +## an xcms xset saved as .rds (read with readRDS below) +args.xsetData <- args[1] +#cat(paste("args.xsetData <- \"", args[1], "\"\n", sep="")) + +args.class1 <- args[2] +args.class2 <- args[3] +#cat(paste("args.class1 <- \"", args[2], "\"\n", sep="")) +#cat(paste("args.class2 <- \"", args[3], "\"\n", sep="")) + +args.topcount <- strtoi(args[4]) +#cat(paste("args.topcount <- ", args[4], "\n", sep="")) + +args.outTable <- args[5] +args.outLogFile <- args[6] +#cat(paste("args.outLogFile <- \"", args[6], "\"\n", sep="")) + +## report files +args.htmlReportFile <- args[7] +args.htmlReportFile.files_path <- args[8] +#cat(paste("args.htmlReportFile <- \"", args[7], "\"\n", sep="")) +#cat(paste("args.htmlReportFile.files_path <- \"", args[8], "\"\n", sep="")) + +# Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888 +msg <- file(args.outLogFile, open="wt") +sink(msg, type="message") +sink(msg, type="output") + +tryCatch( + { + library(xcms) + #library("R2HTML") + + ## load the saved xcms xset : + xcmsSet <- readRDS(args.xsetData) + + # info: levels(xcmsSet@phenoData$class) also gives access to the class names + + reporttab <- diffreport(xcmsSet, args.class1, args.class2, args.htmlReportFile.files_path, args.topcount, metlin = 0.15, h=480, w=640) + + # write out tsv table: + write.table(reporttab, args.outTable, sep="\t", row.names=FALSE) + + message("\nGenerating report.........") + # report + dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE) + # setwd(file.path(args.htmlReportFile.files_path)) + + cat("<html><body><h1>Differential analysis report</h1>", file= args.htmlReportFile) + #HTML(reporttab[1:args.topcount,], file= args.htmlReportFile) + figuresPath <- paste(args.htmlReportFile.files_path, "_eic", sep="") + message(figuresPath) + listOfFiles <- list.files(path = figuresPath) + for (i in 1:length(listOfFiles)) + { + figureName <- listOfFiles[i] + # maybe we still need to copy the figures to the args.htmlReportFile.files_path + cat(paste("<img src='fig_eic/", figureName,"' />", sep=""), file= args.htmlReportFile, append=TRUE) + } + + message("finished generating report") + cat("\nWarnings================:\n") + str( warnings() ) + }, + error=function(cond) { + sink(NULL, type="message") # default setting + sink(stderr(), type="output") + message("\nERROR: ===========\n") + print(cond) + } + ) \ No newline at end of file
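The script reads eight positional arguments (see the top of the file); a sketch of how it could be invoked, here from Python for consistency with the rest of this changeset (Rscript on the PATH and all file names are hypothetical):

    import subprocess

    subprocess.check_call([
        'Rscript', 'xcms_differential_analysis.r',
        'xset.rds',         # saved xcms xset, read with readRDS()
        'wildtype',         # class 1
        'mutant',           # class 2
        '10',               # top N features for the diffreport
        'diffreport.tsv',   # output table
        'run.log',          # log file (stdout/stderr are sinked here)
        'report.html',      # HTML report file
        'report_files'      # files_path directory for the report images
    ])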