#!/usr/bin/env python
# encoding: utf-8
'''
Module to query a set of accurate mass values, detected by high-resolution mass
spectrometers, against various repositories/services such as the METabolomics
EXPlorer database or the MFSearcher service (http://webs2.kazusa.or.jp/mfsearcher/).

For each record in the input file, it queries the molecular mass against the
selected repository/service. If one or more compounds are found, extra
information regarding these compounds is added to the output file.

The output file is thus the input file enriched with information about
related items found in the selected repository/service.

The service should implement the following interface:

http://service_url/mass?targetMs=500&margin=1&marginUnit=ppm&output=txth   (txth means there is guaranteed to be a header line before the data)

The output should be tab-separated and should contain the following columns (in this order):
db-name    molecular-formula    dbe    formula-weight    id    description
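
For example, a response could look like this (values are illustrative only,
not taken from a real database):

# db-name    molecular-formula    dbe    formula-weight    id    description
KNApSAcK     C6H12O6              1.0    180.06339         C00001122    D-Glucose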

'''
import csv
import sys
import fileinput
import urllib2
import time
from collections import OrderedDict

__author__ = "Pieter Lukasse"
__contact__ = "pieter.lukasse@wur.nl"
__copyright__ = "Copyright, 2014, Plant Research International, WUR"
__license__ = "Apache v2"

def _process_file(in_xsv, delim='\t'):
    '''
    Generic method to parse a tab-separated file, returning a dictionary with named columns.
    @param in_xsv: input filename to be parsed
    @param delim: column delimiter (tab by default)
    '''
    data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim))
    return _process_data(data)

def _process_data(data):
    '''
    Converts a list of rows (the first row being the header) into a dictionary
    with the column name as key and the list of that column's values as value.
    '''
    header = data.pop(0)
    output = OrderedDict()
    for index, column_name in enumerate(header):
        output[column_name] = [row[index] for row in data]
    return output
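
# A minimal usage sketch of _process_data (values illustrative):
#
#   _process_data([['mass', 'rt'], ['180.06', '5.2'], ['342.12', '7.1']])
#   -> OrderedDict([('mass', ['180.06', '342.12']), ('rt', ['5.2', '7.1'])])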
def _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit):
    '''
    This method will iterate over the records in the input_data and
    will enrich them with the related information found (if any) in the
    chosen repository/service.

    # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies
    '''
    merged = []

    for i in xrange(len(input_data[input_data.keys()[0]])):
        # Get the record in the same dictionary format as input_data, but containing
        # a value at each column instead of a list of all values of all records:
        input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()]))

        # read the molecular mass:
        molecular_mass = input_data_record[molecular_mass_col]

        # search for related records in repository/service:
        data_found = None
        if molecular_mass != "":
            molecular_mass = float(molecular_mass)

            # 1- search for data around this MM:
            query_link = repository_dblink + "/mass?targetMs=" + str(molecular_mass) + "&margin=" + str(error_margin) + "&marginUnit=" + margin_unit + "&output=txth"
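            # e.g. http://webs2.kazusa.or.jp/mfsearcher/mass?targetMs=180.06&margin=5&marginUnit=ppm&output=txth
            # (illustrative example, using the MFSearcher base URL from the module docstring)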

            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
            data_type_found = "MM"

        if data_found is None:
            # Nothing found: just add empty columns
            extra_cols = ['', '', '', '', '', '']
        else:
            # Add the info found:
            extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)

        # Take all data and merge it into a "flat"/simple array of values:
        field_values_list = _merge_data(input_data_record, extra_cols)

        merged.append(field_values_list)

    # return the merged/enriched records:
    return merged
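
# A possible approach for the multi-threading TODO above: fire the per-record
# queries via a thread pool (untested sketch using only the standard library;
# query_links would be the list of prepared query URLs):
#
#   from multiprocessing.pool import ThreadPool
#   pool = ThreadPool(processes=4)
#   results = pool.map(_fire_query_and_return_dict, query_links)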
def _get_extra_info_and_link_cols(data_found, data_type_found, query_link):
    '''
    This method will go over the data found and will return a
    list with the following items:
    - details of the hits found:
         db-name    molecular-formula    dbe    formula-weight    id    description
    - a link that executes the same query
    '''
    # default to empty lists; the set() calls below make the values unique:
    db_name_set = []
    molecular_formula_set = []
    id_set = []
    description_set = []

    if 'db-name' in data_found:
        db_name_set = set(data_found['db-name'])
    elif '# db-name' in data_found:
        db_name_set = set(data_found['# db-name'])
    if 'molecular-formula' in data_found:
        molecular_formula_set = set(data_found['molecular-formula'])
    if 'id' in data_found:
        id_set = set(data_found['id'])
    if 'description' in data_found:
        description_set = set(data_found['description'])

    result = [data_type_found,
              _to_xsv(db_name_set),
              _to_xsv(molecular_formula_set),
              _to_xsv(id_set),
              _to_xsv(description_set),
              # To let Excel interpret this as a link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"):
              "=HYPERLINK(\"" + query_link + "\", \"Link to entries found in DB \")"]
    return result


def _to_xsv(data_set):
    '''
    Concatenates the items of the given set into a "|"-separated string,
    with a trailing "|" after each item.
    '''
    result = ""
    for item in data_set:
        result = result + str(item) + "|"
    return result
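
# e.g. _to_xsv(set(['C6H12O6'])) returns 'C6H12O6|' (note the trailing separator).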
def _fire_query_and_return_dict(url):
    '''
    This method will fire the query as a web-service call and
    return the results as a dictionary of named columns (see _process_data),
    or None when the service reports no hits.
    '''
    try:
        data = urllib2.urlopen(url).read()

        # transform to dictionary:
        result = []
        data_rows = data.split("\n")

        # remove comment lines, if any (only leave the one that has the "molecular-formula" word in it...compatible with the kazusa service):
        data_rows_to_remove = []
        for data_row in data_rows:
            if data_row == "" or (data_row[0] == '#' and "molecular-formula" not in data_row):
                data_rows_to_remove.append(data_row)

        for data_row in data_rows_to_remove:
            data_rows.remove(data_row)

        # check if there is any data in the response:
        if len(data_rows) <= 1 or data_rows[1].strip() == '':
            # means there is only the header row...so no hits:
            return None

        for data_row in data_rows:
            if not data_row.strip() == '':
                row_as_list = _str_to_list(data_row, delimiter='\t')
                result.append(row_as_list)

        # return the result processed into a dict:
        return _process_data(result)

    except urllib2.HTTPError, e:
        raise Exception("HTTP error for URL: " + url + " : %s - " % e.code + e.reason)
    except urllib2.URLError, e:
        raise Exception("Network error: %s" % e.reason.args[1] + ". Administrator: please check if service [" + url + "] is accessible from your Galaxy server. ")

def _str_to_list(data_row, delimiter='\t'):
    '''
    Splits the given row into a list of column values.
    '''
    return data_row.split(delimiter)


# alternative implementation using the requests package (untested sketch):
#     s = requests.Session()
#     s.verify = False
#     #s.auth = (token01, token02)
#     resp = s.get(url, params={'name': 'anonymous'}, stream=True)
#     content = resp.content
#     # transform to dictionary:
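#     # ...from here on, the same comment-line filtering and _process_data()
#     # call as in _fire_query_and_return_dict() above would apply.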


def _merge_data(input_data_record, extra_cols):
    '''
    Adds the extra information to the existing data record and returns
    the combined new record.
    '''
    record = []
    for column in input_data_record:
        record.append(input_data_record[column])

    # add the extra columns:
    for column in extra_cols:
        record.append(column)

    return record
def _save_data(data_rows, headers, out_csv):
    '''
    Writes the tab-separated data to a file.
    @param data_rows: list of merged/enriched records
    @param headers: list of column headers to write as the first line
    @param out_csv: output csv file name
    '''
    # Open output file for writing
    outfile_single_handle = open(out_csv, 'wb')
    output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")

    # Write headers
    output_single_handle.writerow(headers)

    # Write one line for each row
    for data_row in data_rows:
        output_single_handle.writerow(data_row)

def _get_repository_URL(repository_file):
    '''
    Read out and return the URL stored in the given file.
    '''
    file_input = fileinput.input(repository_file)
    try:
        for line in file_input:
            if line[0] != '#':
                # just return the first line that is not a comment line,
                # stripped of its trailing newline:
                return line.strip()
    finally:
        file_input.close()
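
# A repository file is expected to look like this (illustrative example, using
# the MFSearcher URL from the module docstring):
#
#   # lines starting with '#' are comments
#   http://webs2.kazusa.or.jp/mfsearcher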
def main():
    '''
    Query main function

    The input file can be any tabular file, as long as it contains a column with the molecular mass.
    This column is then used to query against the chosen repository/service database.
    '''
    seconds_start = int(round(time.time()))

    input_file = sys.argv[1]
    molecular_mass_col = sys.argv[2]
    repository_file = sys.argv[3]
    error_margin = float(sys.argv[4])
    margin_unit = sys.argv[5]
    output_result = sys.argv[6]
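
    # Example invocation (script and file names are illustrative):
    #   python query_mass_repos.py input.tsv MM repository.txt 5 ppm output.tsv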

    # Parse repository_file to find the URL of the service:
    repository_dblink = _get_repository_URL(repository_file)

    # Parse tabular input file into dictionary/array:
    input_data = _process_file(input_file)

    # Query data against repository:
    enriched_data = _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit)
    headers = input_data.keys() + ['SEARCH hits for', 'SEARCH hits: db-names', 'SEARCH hits: molecular-formulas',
                                   'SEARCH hits: ids', 'SEARCH hits: descriptions', 'Link to SEARCH hits']  # TODO - add min and max formula weight columns

    _save_data(enriched_data, headers, output_result)

    seconds_end = int(round(time.time()))
    print "Took " + str(seconds_end - seconds_start) + " seconds"


if __name__ == '__main__':
    main()