Mercurial > repos > pieterlukasse > prims_metabolomics

comparison: query_mass_repos.py @ 1:071a185c2ced

Changeset description: "new tools"

| author | pieter.lukasse@wur.nl |
|---|---|
| date | Fri, 24 Oct 2014 12:52:56 +0200 |
| parents | |
| children | |
Comparison of 0:4b94bb2d381c with 1:071a185c2ced; the file contents at this revision follow:
```python
#!/usr/bin/env python
# encoding: utf-8
'''
Module to query a set of accurate mass values detected by high-resolution mass spectrometers
against various repositories/services, such as the METabolomics EXPlorer database or the
MFSearcher service (http://webs2.kazusa.or.jp/mfsearcher/).

It will take the input file and, for each record, query the
molecular mass in the selected repository/service. If one or more compounds are found,
extra information regarding these compounds is added to the output file.

The output file is thus the input file enriched with information about
related items found in the selected repository/service.

The service should implement the following interface:

http://service_url/mass?targetMs=500&margin=1&marginUnit=ppm&output=txth
(txth means there is guaranteed to be a header line before the data)

The output should be tab-separated and should contain the following columns (in this order):
db-name  molecular-formula  dbe  formula-weight  id  description

'''
import csv
import sys
import fileinput
import urllib2
import time
from collections import OrderedDict

__author__ = "Pieter Lukasse"
__contact__ = "pieter.lukasse@wur.nl"
__copyright__ = "Copyright, 2014, Plant Research International, WUR"
__license__ = "Apache v2"

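# Illustrative example of a service response in the "txth" output format
# described above (tab-separated, with a '#'-prefixed header line; the data
# row below is made up):
#
#   # db-name   molecular-formula   dbe   formula-weight   id     description
#   ExampleDB   C27H48O8            4.0   500.33           X001   made-up compound
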
def _process_file(in_xsv, delim='\t'):
    '''
    Generic method to parse a tab-separated file, returning a dictionary with named columns
    @param in_xsv: input filename to be parsed
    '''
    data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim))
    return _process_data(data)

def _process_data(data):

    header = data.pop(0)
    # Create dictionary with column name as key
    output = OrderedDict()
    for index in xrange(len(header)):
        output[header[index]] = [row[index] for row in data]
    return output
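
# For illustration (a sketch, with made-up values): given a tab-separated input
# file containing
#
#   MM      name
#   500.43  compound_a
#   252.11  compound_b
#
# _process_file would return:
#   OrderedDict([('MM', ['500.43', '252.11']), ('name', ['compound_a', 'compound_b'])])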


def _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit):

    '''
    This method will iterate over the records in the input_data and
    will enrich them with the related information found (if any) in the
    chosen repository/service.

    # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies
    '''
    merged = []

    for i in xrange(len(input_data[input_data.keys()[0]])):
        # Get the record in the same dictionary format as input_data, but containing
        # a value at each column instead of a list of all values of all records:
        input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()]))

        # read the molecular mass:
        molecular_mass = input_data_record[molecular_mass_col]

        # search for related records in repository/service:
        data_found = None
        if molecular_mass != "":
            molecular_mass = float(molecular_mass)

            # 1- search for data around this MM:
            query_link = repository_dblink + "/mass?targetMs=" + str(molecular_mass) + "&margin=" + str(error_margin) + "&marginUnit=" + margin_unit + "&output=txth"

            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
            data_type_found = "MM"

        if data_found is None:
            # If nothing was found, just add empty columns:
            extra_cols = ['', '', '', '', '', '']
        else:
            # Add info found:
            extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)

        # Take all data and merge it into a "flat"/simple array of values:
        field_values_list = _merge_data(input_data_record, extra_cols)

        merged.append(field_values_list)

    # return the merged/enriched records:
    return merged
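
# For illustration: for a molecular mass of 500.43, an error margin of 5.0 and
# margin unit "ppm", query_link becomes (with "http://service_url" standing in
# for the URL read from the repository file):
#   http://service_url/mass?targetMs=500.43&margin=5.0&marginUnit=ppm&output=txth
# (_fire_query_and_return_dict is then called with "&_format_result=tsv" appended).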


def _get_extra_info_and_link_cols(data_found, data_type_found, query_link):
    '''
    This method will go over the data found and will return a
    list with the following items:
    - details of hits found:
      db-name molecular-formula dbe formula-weight id description
    - a link that executes the same query
    '''

    # set() is used below to keep only the unique values; start with empty defaults:
    db_name_set = []
    molecular_formula_set = []
    id_set = []
    description_set = []

    if 'db-name' in data_found:
        db_name_set = set(data_found['db-name'])
    elif '# db-name' in data_found:
        db_name_set = set(data_found['# db-name'])
    if 'molecular-formula' in data_found:
        molecular_formula_set = set(data_found['molecular-formula'])
    if 'id' in data_found:
        id_set = set(data_found['id'])
    if 'description' in data_found:
        description_set = set(data_found['description'])

    result = [data_type_found,
              _to_xsv(db_name_set),
              _to_xsv(molecular_formula_set),
              _to_xsv(id_set),
              _to_xsv(description_set),
              # To let Excel interpret this as a link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"):
              "=HYPERLINK(\"" + query_link + "\", \"Link to entries found in DB \")"]
    return result


def _to_xsv(data_set):
    result = ""
    for item in data_set:
        result = result + str(item) + "|"
    return result
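
# Note that _to_xsv leaves a trailing separator: e.g. _to_xsv(set(['C6H12O6']))
# returns "C6H12O6|" (and iteration order over a set is arbitrary).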


def _fire_query_and_return_dict(url):
    '''
    This method will fire the query as a web-service call and
    return the results as a dictionary with one entry per column
    (see _process_data), or None if the service reports no hits
    '''

    try:
        data = urllib2.urlopen(url).read()

        # transform to dictionary:
        result = []
        data_rows = data.split("\n")

        # remove comment lines, if any (only keep the one that has the word
        # "molecular-formula" in it... compatible with the Kazusa service):
        data_rows_to_remove = []
        for data_row in data_rows:
            if data_row == "" or (data_row[0] == '#' and "molecular-formula" not in data_row):
                data_rows_to_remove.append(data_row)

        for data_row in data_rows_to_remove:
            data_rows.remove(data_row)

        # check if there is any data in the response:
        if len(data_rows) <= 1 or data_rows[1].strip() == '':
            # means there is only the header row... so no hits:
            return None

        for data_row in data_rows:
            if not data_row.strip() == '':
                row_as_list = _str_to_list(data_row, delimiter='\t')
                result.append(row_as_list)

        # return result processed into a dict:
        return _process_data(result)

    except urllib2.HTTPError, e:
        raise Exception("HTTP error for URL: " + url + " : %s - " % e.code + e.reason)
    except urllib2.URLError, e:
        raise Exception("Network error: %s" % e.reason.args[1] + ". Administrator: please check if service [" + url + "] is accessible from your Galaxy server. ")

def _str_to_list(data_row, delimiter='\t'):
    result = []
    for column in data_row.split(delimiter):
        result.append(column)
    return result
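
# Note: _str_to_list(data_row, delimiter) is equivalent to data_row.split(delimiter).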


# alternative: ?
# s = requests.Session()
# s.verify = False
# #s.auth = (token01, token02)
# resp = s.get(url, params={'name': 'anonymous'}, stream=True)
# content = resp.content
# # transform to dictionary:


def _merge_data(input_data_record, extra_cols):
    '''
    Adds the extra information to the existing data record and returns
    the combined new record.
    '''
    record = []
    for column in input_data_record:
        record.append(input_data_record[column])

    # add extra columns:
    for column in extra_cols:
        record.append(column)

    return record
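
# Note: in Python 2 this is equivalent to input_data_record.values() + extra_cols,
# since OrderedDict.values() returns the values in insertion order.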


def _save_data(data_rows, headers, out_csv):
    '''
    Writes tab-separated data to file
    @param data_rows: list of records containing the merged/enriched dataset
    @param headers: column headers to write as the first line
    @param out_csv: output file name (tab-separated)
    '''

    # Open output file for writing
    outfile_single_handle = open(out_csv, 'wb')
    output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")

    # Write headers
    output_single_handle.writerow(headers)

    # Write one line for each row
    for data_row in data_rows:
        output_single_handle.writerow(data_row)

def _get_repository_URL(repository_file):
    '''
    Read out and return the URL stored in the given file.
    '''
    file_input = fileinput.input(repository_file)
    try:
        for line in file_input:
            if line[0] != '#':
                # just return the first line that is not a comment line
                # (stripped, so the trailing newline does not end up in the query URL):
                return line.strip()
    finally:
        file_input.close()
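
# Illustrative example of a repository file (the URL is a placeholder):
#
#   # lines starting with '#' are ignored
#   http://service_url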


def main():
    '''
    Query main function

    The input file can be any tabular file, as long as it contains a column for the molecular mass.
    This column is then used to query against the chosen repository/service database.
    '''
    seconds_start = int(round(time.time()))

    input_file = sys.argv[1]
    molecular_mass_col = sys.argv[2]
    repository_file = sys.argv[3]
    error_margin = float(sys.argv[4])
    margin_unit = sys.argv[5]
    output_result = sys.argv[6]

    # Parse repository_file to find the URL to the service:
    repository_dblink = _get_repository_URL(repository_file)

    # Parse tabular input file into dictionary/array:
    input_data = _process_file(input_file)

    # Query data against repository:
    enriched_data = _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit)
    headers = input_data.keys() + ['SEARCH hits for', 'SEARCH hits: db-names', 'SEARCH hits: molecular-formulas',
                                   'SEARCH hits: ids', 'SEARCH hits: descriptions', 'Link to SEARCH hits']  # TODO - add min and max formula weight columns

    _save_data(enriched_data, headers, output_result)

    seconds_end = int(round(time.time()))
    print "Took " + str(seconds_end - seconds_start) + " seconds"


if __name__ == '__main__':
    main()
```
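
For reference, a minimal sketch of how this script would be invoked (argument order taken from main() above; all file names and parameter values are hypothetical, and Python 2 is assumed since the script uses urllib2 and print statements):

```python
# Hypothetical invocation of query_mass_repos.py (a sketch, not from the repository):
import subprocess

subprocess.check_call([
    "python", "query_mass_repos.py",
    "input.tsv",        # tabular input file containing a molecular mass column
    "MM",               # name of the molecular mass column in input.tsv
    "repository.txt",   # file whose first non-comment line is the service URL
    "5",                # error margin
    "ppm",              # margin unit
    "output.tsv",       # enriched output file
])
```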
