comparison metfrag.py @ 8:9a3019c609d9 draft

planemo upload for repository https://github.com/computational-metabolomics/metfrag-galaxy commit febaaca439248736775db9ad0a857e30463d10aa
author tomnl
date Fri, 13 Sep 2019 08:27:59 -0400
parents 0b3816a7a14b
children 5763234618d4
comparison
equal deleted inserted replaced
7:0b3816a7a14b 8:9a3019c609d9
115 meta_regex['precursor_mz'].extend(regex_msp['precursor_mz']) 115 meta_regex['precursor_mz'].extend(regex_msp['precursor_mz'])
116 meta_regex['precursor_type'].extend(regex_msp['precursor_type']) 116 meta_regex['precursor_type'].extend(regex_msp['precursor_type'])
117 meta_regex['num_peaks'].extend(regex_msp['num_peaks']) 117 meta_regex['num_peaks'].extend(regex_msp['num_peaks'])
118 meta_regex['msp'] = regex_msp['msp'] 118 meta_regex['msp'] = regex_msp['msp']
119 119
120 print(meta_regex) 120
121 121
122 adduct_types = { 122 adduct_types = {
123 '[M+H]+': 1.007276, 123 '[M+H]+': 1.007276,
124 '[M+NH4]+': 18.034374, 124 '[M+NH4]+': 18.034374,
125 '[M+Na]+': 22.989218, 125 '[M+Na]+': 22.989218,
133 '[M+HCOO]-': 44.99819, 133 '[M+HCOO]-': 44.99819,
134 '[M-H+HCOOH]-': 44.99819, # same as above but different style of writing adduct 134 '[M-H+HCOOH]-': 44.99819, # same as above but different style of writing adduct
135 '[M+CH3COO]-': 59.01385, 135 '[M+CH3COO]-': 59.01385,
136 '[M-H+CH3COOH]-': 59.01385 # same as above but different style of writing adduct 136 '[M-H+CH3COOH]-': 59.01385 # same as above but different style of writing adduct
137 } 137 }
138 inv_adduct_types = {int(round(v, 0)): k for k, v in adduct_types.iteritems()}
138 139
139 # function to extract the meta data using the regular expressions 140 # function to extract the meta data using the regular expressions
140 def parse_meta(meta_regex, meta_info={}): 141 def parse_meta(meta_regex, meta_info={}):
141 for k, regexes in six.iteritems(meta_regex): 142 for k, regexes in six.iteritems(meta_regex):
142 for reg in regexes: 143 for reg in regexes:
262 paramd['additional_details'] = meta_info 263 paramd['additional_details'] = meta_info
263 else: 264 else:
264 # Just have and index of the spectra in the MSP file 265 # Just have and index of the spectra in the MSP file
265 paramd['additional_details'] = {'spectra_idx': spectrac} 266 paramd['additional_details'] = {'spectra_idx': spectrac}
266 267
268
267 paramd["SampleName"] = "{}_metfrag_result".format(spectrac) 269 paramd["SampleName"] = "{}_metfrag_result".format(spectrac)
268 270
269 # =============== Output peaks to txt file ============================== 271 # =============== Output peaks to txt file ==============================
270 paramd["PeakListPath"] = os.path.join(wd, "{}_tmpspec.txt".format(spectrac)) 272 paramd["PeakListPath"] = os.path.join(wd, "{}_tmpspec.txt".format(spectrac))
271 273
275 outfile.write(p[0] + "\t" + p[1] + "\n") 277 outfile.write(p[0] + "\t" + p[1] + "\n")
276 278
277 # =============== Update param based on MSP metadata ====================== 279 # =============== Update param based on MSP metadata ======================
278 # Replace param details with details from MSP if required 280 # Replace param details with details from MSP if required
279 if 'precursor_type' in meta_info and meta_info['precursor_type'] in adduct_types: 281 if 'precursor_type' in meta_info and meta_info['precursor_type'] in adduct_types:
282 adduct = meta_info['precursor_type']
280 nm = float(meta_info['precursor_mz']) - adduct_types[meta_info['precursor_type']] 283 nm = float(meta_info['precursor_mz']) - adduct_types[meta_info['precursor_type']]
281 paramd["PrecursorIonMode"] = int(round(adduct_types[meta_info['precursor_type']], 0)) 284 paramd["PrecursorIonMode"] = int(round(adduct_types[meta_info['precursor_type']], 0))
282 elif not args.skip_invalid_adducts: 285 elif not args.skip_invalid_adducts:
286 adduct = inv_adduct_types[int(paramd['PrecursorIonModeDefault'])]
283 paramd["PrecursorIonMode"] = paramd['PrecursorIonModeDefault'] 287 paramd["PrecursorIonMode"] = paramd['PrecursorIonModeDefault']
284 nm = float(meta_info['precursor_mz']) - paramd['nm_mass_diff_default'] 288 nm = float(meta_info['precursor_mz']) - paramd['nm_mass_diff_default']
285 else: 289 else:
286 print('Skipping {}'.format(paramd["SampleName"])) 290 print('Skipping {}'.format(paramd["SampleName"]))
287 return '', '' 291 return '', ''
288 292
293 paramd['additional_details']['adduct'] = adduct
289 paramd["NeutralPrecursorMass"] = nm 294 paramd["NeutralPrecursorMass"] = nm
290 295
291 # =============== Create CLI cmd for metfrag =============================== 296 # =============== Create CLI cmd for metfrag ===============================
292 cmd = "metfrag" 297 cmd = "metfrag"
293 for k, v in six.iteritems(paramd): 298 for k, v in six.iteritems(paramd):
294 if not k in ['PrecursorIonModeDefault', 'nm_mass_diff_default', 'additional_details']: 299 if not k in ['PrecursorIonModeDefault', 'nm_mass_diff_default', 'additional_details']:
295 cmd += " {}={}".format(str(k), str(v)) 300 cmd += " {}={}".format(str(k), str(v))
296 301
297 # =============== Run metfrag ============================================== 302 # =============== Run metfrag ==============================================
298 print(cmd) 303 #print(cmd)
299 # Filter before process with a minimum number of MS/MS peaks 304 # Filter before process with a minimum number of MS/MS peaks
300 if plinesread >= float(args.minMSMSpeaks): 305 if plinesread >= float(args.minMSMSpeaks):
301 306
302 if int(args.cores_top_level) == 1: 307 if int(args.cores_top_level) == 1:
303 os.system(cmd) 308 os.system(cmd)
309
310
304 311
305 return paramd, cmd 312 return paramd, cmd
306 313
307 314
308 def work(cmds): 315 def work(cmds):
414 421
415 additional_detail_headers = ['sample_name'] 422 additional_detail_headers = ['sample_name']
416 for k, paramd in six.iteritems(paramds): 423 for k, paramd in six.iteritems(paramds):
417 additional_detail_headers = list(set(additional_detail_headers + list(paramd['additional_details'].keys()))) 424 additional_detail_headers = list(set(additional_detail_headers + list(paramd['additional_details'].keys())))
418 425
426 # add inchikey if not already present (missing in metchem output)
427 if 'InChIKey' not in headers:
428 headers.append('InChIKey')
429
419 headers = additional_detail_headers + sorted(list(set(headers))) 430 headers = additional_detail_headers + sorted(list(set(headers)))
431
420 432
421 433
422 # Sort files nicely 434 # Sort files nicely
423 outfiles.sort(key = lambda s: int(re.match('^.*/(\d+)_metfrag_result.csv', s).group(1))) 435 outfiles.sort(key = lambda s: int(re.match('^.*/(\d+)_metfrag_result.csv', s).group(1)))
424 436
460 472
461 # Write the line if it pass all filters 473 # Write the line if it pass all filters
462 if bewrite: 474 if bewrite:
463 bfn = os.path.basename(fn) 475 bfn = os.path.basename(fn)
464 bfn = bfn.replace(".csv", "") 476 bfn = bfn.replace(".csv", "")
477 line['sample_name'] = paramds[bfn]['SampleName']
465 ad = paramds[bfn]['additional_details'] 478 ad = paramds[bfn]['additional_details']
479
480 if args.MetFragDatabaseType == "MetChem":
481 # for some reason the metchem database option does not report the full inchikey (at least
482 # in the Bham setup. This ensures we always get the fully inchikey
483 line['InChIKey'] = '{}-{}-{}'.format(line['InChIKey1'], line['InChIKey2'], line['InChIKey3'])
484
466 line.update(ad) 485 line.update(ad)
467 line['sample_name'] = paramds[bfn]['SampleName']
468
469 dwriter.writerow(line) 486 dwriter.writerow(line)