comparison metfrag.py @ 2:d040e27b6225 draft

planemo upload for repository https://github.com/computational-metabolomics/metfrag-galaxy commit 59631850110e60900d03f642c445cc35a36414b5
author tomnl
date Mon, 03 Jun 2019 11:20:55 -0400
parents c1b168770b68
children 5ee936e570a7
comparison
equal deleted inserted replaced
1:c1b168770b68 2:d040e27b6225
67 if os.path.exists(wd): 67 if os.path.exists(wd):
68 shutil.rmtree(wd) 68 shutil.rmtree(wd)
69 os.makedirs(wd) 69 os.makedirs(wd)
70 else: 70 else:
71 os.makedirs(wd) 71 os.makedirs(wd)
72
73 ######################################################################
74 # Setup parameter dictionary
75 ######################################################################
76 paramd = defaultdict()
77
78 paramd["MetFragDatabaseType"] = args.MetFragDatabaseType
79
80 if args.MetFragDatabaseType == "LocalCSV":
81 paramd["LocalDatabasePath"] = args.LocalDatabasePath
82 elif args.MetFragDatabaseType == "MetChem":
83 paramd["LocalMetChemDatabase"] = "metchem"
84 paramd["LocalMetChemDatabasePortNumber"] = 5432
85 paramd["LocalMetChemDatabaseServerIp"] = args.LocalMetChemDatabaseServerIp
86 paramd["LocalMetChemDatabaseUser"] = "metchemro"
87 paramd["LocalMetChemDatabasePassword"] = "metchemro"
88
89 paramd["FragmentPeakMatchAbsoluteMassDeviation"] = args.FragmentPeakMatchAbsoluteMassDeviation
90 paramd["FragmentPeakMatchRelativeMassDeviation"] = args.FragmentPeakMatchRelativeMassDeviation
91 paramd["DatabaseSearchRelativeMassDeviation"] = args.DatabaseSearchRelativeMassDeviation
92 paramd["SampleName"] = ''
93 paramd["ResultsPath"] = os.path.join(wd)
94
95 if args.polarity == "pos":
96 paramd["IsPositiveIonMode"] = True
97 paramd["PrecursorIonModeDefault"] = "1"
98 paramd["PrecursorIonMode"] = "1"
99 paramd["nm_mass_diff_default"] = 1.007276
100 else:
101 paramd["IsPositiveIonMode"] = False
102 paramd["PrecursorIonModeDefault"] = "-1"
103 paramd["PrecursorIonMode"] = "-1"
104 paramd["nm_mass_diff_default"] = -1.007276
105
106 paramd["MetFragCandidateWriter"] = "CSV"
107 paramd["NumberThreads"] = args.NumberThreads
108
109 if args.ScoreSuspectLists:
110 paramd["ScoreSuspectLists"] = args.ScoreSuspectLists
111
112 paramd["MetFragScoreTypes"] = args.MetFragScoreTypes
113 paramd["MetFragScoreWeights"] = args.MetFragScoreWeights
114
115 dct_filter = defaultdict()
116 filterh = []
117
118 if args.UnconnectedCompoundFilter:
119 filterh.append('UnconnectedCompoundFilter')
120
121 if args.IsotopeFilter:
122 filterh.append('IsotopeFilter')
123
124 if args.FilterMinimumElements:
125 filterh.append('MinimumElementsFilter')
126 dct_filter['FilterMinimumElements'] = args.FilterMinimumElements
127
128 if args.FilterMaximumElements:
129 filterh.append('MaximumElementsFilter')
130 dct_filter['FilterMaximumElements'] = args.FilterMaximumElements
131
132 if args.FilterSmartsInclusionList:
133 filterh.append('SmartsSubstructureInclusionFilter')
134 dct_filter['FilterSmartsInclusionList'] = args.FilterSmartsInclusionList
135
136 if args.FilterSmartsExclusionList:
137 filterh.append('SmartsSubstructureExclusionFilter')
138 dct_filter['FilterSmartsExclusionList'] = args.FilterSmartsExclusionList
139
140 # My understanding is that both 'ElementInclusionExclusiveFilter' and 'ElementExclusionFilter' use
141 # 'FilterIncludedElements'
142 if args.FilterIncludedExclusiveElements:
143 filterh.append('ElementInclusionExclusiveFilter')
144 dct_filter['FilterIncludedElements'] = args.FilterIncludedExclusiveElements
145
146 if args.FilterIncludedElements:
147 filterh.append('ElementInclusionFilter')
148 dct_filter['FilterIncludedElements'] = args.FilterIncludedElements
149
150 if args.FilterExcludedElements:
151 filterh.append('ElementExclusionFilter')
152 dct_filter['FilterExcludedElements'] = args.FilterExcludedElements
153
154 if filterh:
155 fcmds = ','.join(filterh) + ' '
156 for k, v in six.iteritems(dct_filter):
157 fcmds += "{0}={1} ".format(str(k), str(v))
158
159 paramd["MetFragPreProcessingCandidateFilter"] = fcmds
160
161 print(paramd)
162 72
163 ###################################################################### 73 ######################################################################
164 # Setup regular expressions for MSP parsing dictionary 74 # Setup regular expressions for MSP parsing dictionary
165 ###################################################################### 75 ######################################################################
166 regex_msp = {} 76 regex_msp = {}
196 meta_regex['num_peaks'].extend(regex_msp['num_peaks']) 106 meta_regex['num_peaks'].extend(regex_msp['num_peaks'])
197 meta_regex['msp'] = regex_msp['msp'] 107 meta_regex['msp'] = regex_msp['msp']
198 108
199 print(meta_regex) 109 print(meta_regex)
200 110
201
202
203 # this dictionary will store the meta data results form the MSp file
204 meta_info = {}
205
206
207 # function to extract the meta data using the regular expressions
208 def parse_meta(meta_regex, meta_info={}):
209 for k, regexes in six.iteritems(meta_regex):
210 for reg in regexes:
211 m = re.search(reg, line, re.IGNORECASE)
212 if m:
213 meta_info[k] = '-'.join(m.groups()).strip()
214 return meta_info
215
216
217 adduct_types = { 111 adduct_types = {
218 '[M+H]+': 1.007276, 112 '[M+H]+': 1.007276,
219 '[M+NH4]+': 18.034374, 113 '[M+NH4]+': 18.034374,
220 '[M+Na]+': 22.989218, 114 '[M+Na]+': 22.989218,
221 '[M+K]+': 38.963158, 115 '[M+K]+': 38.963158,
225 '[M+2ACN+H]+': 83.06037, 119 '[M+2ACN+H]+': 83.06037,
226 '[M-H]-': -1.007276, 120 '[M-H]-': -1.007276,
227 '[M+Cl]-': 34.969402, 121 '[M+Cl]-': 34.969402,
228 } 122 }
229 123
124 # function to extract the meta data using the regular expressions
125 def parse_meta(meta_regex, meta_info={}):
126 for k, regexes in six.iteritems(meta_regex):
127 for reg in regexes:
128 m = re.search(reg, line, re.IGNORECASE)
129 if m:
130 meta_info[k] = '-'.join(m.groups()).strip()
131 return meta_info
132
133
134
135 ######################################################################
136 # Setup parameter dictionary
137 ######################################################################
138 def init_paramd(args):
139 paramd = defaultdict()
140
141 paramd["MetFragDatabaseType"] = args.MetFragDatabaseType
142
143 if args.MetFragDatabaseType == "LocalCSV":
144 paramd["LocalDatabasePath"] = args.LocalDatabasePath
145 elif args.MetFragDatabaseType == "MetChem":
146 paramd["LocalMetChemDatabase"] = "metchem"
147 paramd["LocalMetChemDatabasePortNumber"] = 5432
148 paramd["LocalMetChemDatabaseServerIp"] = args.LocalMetChemDatabaseServerIp
149 paramd["LocalMetChemDatabaseUser"] = "metchemro"
150 paramd["LocalMetChemDatabasePassword"] = "metchemro"
151
152 paramd["FragmentPeakMatchAbsoluteMassDeviation"] = args.FragmentPeakMatchAbsoluteMassDeviation
153 paramd["FragmentPeakMatchRelativeMassDeviation"] = args.FragmentPeakMatchRelativeMassDeviation
154 paramd["DatabaseSearchRelativeMassDeviation"] = args.DatabaseSearchRelativeMassDeviation
155 paramd["SampleName"] = ''
156 paramd["ResultsPath"] = os.path.join(wd)
157
158 if args.polarity == "pos":
159 paramd["IsPositiveIonMode"] = True
160 paramd["PrecursorIonModeDefault"] = "1"
161 paramd["PrecursorIonMode"] = "1"
162 paramd["nm_mass_diff_default"] = 1.007276
163 else:
164 paramd["IsPositiveIonMode"] = False
165 paramd["PrecursorIonModeDefault"] = "-1"
166 paramd["PrecursorIonMode"] = "-1"
167 paramd["nm_mass_diff_default"] = -1.007276
168
169 paramd["MetFragCandidateWriter"] = "CSV"
170 paramd["NumberThreads"] = args.NumberThreads
171
172 if args.ScoreSuspectLists:
173 paramd["ScoreSuspectLists"] = args.ScoreSuspectLists
174
175 paramd["MetFragScoreTypes"] = args.MetFragScoreTypes
176 paramd["MetFragScoreWeights"] = args.MetFragScoreWeights
177
178 dct_filter = defaultdict()
179 filterh = []
180
181 if args.UnconnectedCompoundFilter:
182 filterh.append('UnconnectedCompoundFilter')
183
184 if args.IsotopeFilter:
185 filterh.append('IsotopeFilter')
186
187 if args.FilterMinimumElements:
188 filterh.append('MinimumElementsFilter')
189 dct_filter['FilterMinimumElements'] = args.FilterMinimumElements
190
191 if args.FilterMaximumElements:
192 filterh.append('MaximumElementsFilter')
193 dct_filter['FilterMaximumElements'] = args.FilterMaximumElements
194
195 if args.FilterSmartsInclusionList:
196 filterh.append('SmartsSubstructureInclusionFilter')
197 dct_filter['FilterSmartsInclusionList'] = args.FilterSmartsInclusionList
198
199 if args.FilterSmartsExclusionList:
200 filterh.append('SmartsSubstructureExclusionFilter')
201 dct_filter['FilterSmartsExclusionList'] = args.FilterSmartsExclusionList
202
203 # My understanding is that both 'ElementInclusionExclusiveFilter' and 'ElementExclusionFilter' use
204 # 'FilterIncludedElements'
205 if args.FilterIncludedExclusiveElements:
206 filterh.append('ElementInclusionExclusiveFilter')
207 dct_filter['FilterIncludedElements'] = args.FilterIncludedExclusiveElements
208
209 if args.FilterIncludedElements:
210 filterh.append('ElementInclusionFilter')
211 dct_filter['FilterIncludedElements'] = args.FilterIncludedElements
212
213 if args.FilterExcludedElements:
214 filterh.append('ElementExclusionFilter')
215 dct_filter['FilterExcludedElements'] = args.FilterExcludedElements
216
217 if filterh:
218 fcmds = ','.join(filterh) + ' '
219 for k, v in six.iteritems(dct_filter):
220 fcmds += "{0}={1} ".format(str(k), str(v))
221
222 paramd["MetFragPreProcessingCandidateFilter"] = fcmds
223
224 return paramd
225
226
227 ######################################################################
228 # Function to run metfrag when all metainfo and peaks have been parsed
229 ######################################################################
230 def run_metfrag(meta_info, peaklist, args, wd, spectrac, adduct_types):
231 # Get sample details (if possible to extract) e.g. if created as part of the msPurity pipeline)
232 # choose between getting additional details to add as columns as either all meta data from msp, just
233 # details from the record name (i.e. when using msPurity and we have the columns coded into the name) or
234 # just the spectra index (spectrac)]
235 # Returns the parameters used and the command line call
236
237 paramd = init_paramd(args)
238 if args.meta_select_col == 'name':
239 # have additional column of just the name
240 paramd['additional_details'] = {'name': meta_info['name']}
241 elif args.meta_select_col == 'name_split':
242 # have additional columns split by "|" and then on ":" e.g. MZ:100.2 | RT:20 | xcms_grp_id:1
243 paramd['additional_details'] = {sm.split(":")[0].strip(): sm.split(":")[1].strip() for sm in
244 meta_info['name'].split("|")}
245 elif args.meta_select_col == 'all':
246 # have additional columns based on all the meta information extracted from the MSP
247 paramd['additional_details'] = meta_info
248 else:
249 # Just have and index of the spectra in the MSP file
250 paramd['additional_details'] = {'spectra_idx': spectrac}
251
252 paramd["SampleName"] = "{}_metfrag_result".format(spectrac)
253
254 # =============== Output peaks to txt file ==============================
255 paramd["PeakListPath"] = os.path.join(wd, "{}_tmpspec.txt".format(spectrac))
256
257 # write spec file
258 with open(paramd["PeakListPath"], 'w') as outfile:
259 for p in peaklist:
260 outfile.write(p[0] + "\t" + p[1] + "\n")
261
262 # =============== Update param based on MSP metadata ======================
263 # Replace param details with details from MSP if required
264 if 'precursor_type' in meta_info and meta_info['precursor_type'] in adduct_types:
265
266 nm = float(meta_info['precursor_mz']) + adduct_types[meta_info['precursor_type']]
267 paramd["PrecursorIonMode"] = int(round(adduct_types[meta_info['precursor_type']], 0))
268 else:
269
270 paramd["PrecursorIonMode"] = paramd['PrecursorIonModeDefault']
271 nm = float(meta_info['precursor_mz']) + paramd['nm_mass_diff_default']
272
273 paramd["NeutralPrecursorMass"] = nm
274
275 # =============== Create CLI cmd for metfrag ===============================
276 cmd = "metfrag"
277 for k, v in six.iteritems(paramd):
278 if not k in ['PrecursorIonModeDefault', 'nm_mass_diff_default', 'additional_details']:
279 cmd += " {}={}".format(str(k), str(v))
280
281 # =============== Run metfrag ==============================================
282 # Filter before process with a minimum number of MS/MS peaks
283 if plinesread >= float(args.minMSMSpeaks):
284
285 if int(args.cores_top_level) == 1:
286 os.system(cmd)
287
288 return paramd, cmd
289
290
291 def work(cmds):
292 return [os.system(cmd) for cmd in cmds]
293
294
295
230 ###################################################################### 296 ######################################################################
231 # Parse MSP file and run metfrag CLI 297 # Parse MSP file and run metfrag CLI
232 ###################################################################### 298 ######################################################################
233 # keep list of commands if performing in CLI in parallel 299 # keep list of commands if performing in CLI in parallel
234 cmds = [] 300 cmds = []
235 # keep a dictionary of all params 301 # keep a dictionary of all params
236 paramds = {} 302 paramds = {}
237 # keep count of spectra (for uid) 303 # keep count of spectra (for uid)
238 spectrac = 0 304 spectrac = 0
305 # this dictionary will store the meta data results form the MSp file
306 meta_info = {}
239 307
240 with open(args.input_pth, "r") as infile: 308 with open(args.input_pth, "r") as infile:
241 numlines = 0 309 # number of lines for the peaks
310 pnumlines = 0
311 # number of lines read for the peaks
312 plinesread = 0
242 for line in infile: 313 for line in infile:
243 line = line.strip() 314 line = line.strip()
244 print(line) 315
245 if numlines == 0: 316 if pnumlines == 0:
246 # =============== Extract metadata from MSP ======================== 317 # =============== Extract metadata from MSP ========================
247 meta_info = parse_meta(meta_regex, meta_info) 318 meta_info = parse_meta(meta_regex, meta_info)
248 print(meta_info) 319
249 if ('massbank' in meta_info and 'cols' in meta_info) or ('msp' in meta_info and 'num_peaks' in meta_info): 320 if ('massbank' in meta_info and 'cols' in meta_info) or ('msp' in meta_info and 'num_peaks' in meta_info):
250 print('check') 321
251 numlines = int(meta_info['num_peaks']) 322 pnumlines = int(meta_info['num_peaks'])
252 linesread = 0 323 plinesread = 0
253 peaklist = [] 324 peaklist = []
254 325
255 elif linesread < numlines: 326 elif plinesread < pnumlines:
256 # =============== Extract peaks from MSP ========================== 327 # =============== Extract peaks from MSP ==========================
257 line = tuple(line.split()) # .split() will split on any empty space (i.e. tab and space) 328 line = tuple(line.split()) # .split() will split on any empty space (i.e. tab and space)
258 # Keep only m/z and intensity, not relative intensity 329 # Keep only m/z and intensity, not relative intensity
259 save_line = tuple(line[0].split() + line[1].split()) 330 save_line = tuple(line[0].split() + line[1].split())
260 linesread += 1 331 plinesread += 1
261 peaklist.append(save_line) 332 peaklist.append(save_line)
262 333
263 elif linesread == numlines: 334 elif plinesread and plinesread == pnumlines:
264 # =============== Get sample name and additional details for output ======= 335 # =============== Get sample name and additional details for output =======
265 # use a unique uuid4 to keep track of processing (important for multicore)
266 #rd = str(uuid.uuid4())
267 spectrac += 1 336 spectrac += 1
268 337 paramd, cmd = run_metfrag(meta_info, peaklist, args, wd, spectrac, adduct_types)
269 # Get sample details (if possible to extract) e.g. if created as part of the msPurity pipeline) 338
270 # choose between getting additional details to add as columns as either all meta data from msp, just
271 # details from the record name (i.e. when using msPurity and we have the columns coded into the name) or
272 # just the spectra index (spectrac)
273 if args.meta_select_col == 'name':
274 # have additional column of just the name
275 paramd['additional_details'] = {'name': meta_info['name']}
276 elif args.meta_select_col == 'name_split':
277 # have additional columns split by "|" and then on ":" e.g. MZ:100.2 | RT:20 | xcms_grp_id:1
278 sampled = {sm.split(":")[0].strip(): sm.split(":")[1].strip() for sm in
279 meta_info['name'].split("|")}
280 elif args.meta_select_col == 'all':
281 # have additional columns based on all the meta information extracted from the MSP
282 paramd['additional_details'] = meta_info
283 else:
284 # Just have and index of the spectra in the MSP file
285 paramd['additional_details'] = {'spectra_idx': spectrac}
286
287 paramd["SampleName"] = "{}_metfrag_result".format(spectrac)
288
289 # =============== Output peaks to txt file ==============================
290 numlines = 0
291 paramd["PeakListPath"] = os.path.join(wd, "{}_tmpspec.txt".format(spectrac))
292 print(paramd["PeakListPath"])
293 # write spec file
294 with open(paramd["PeakListPath"], 'w') as outfile:
295 for p in peaklist:
296 outfile.write(p[0] + "\t" + p[1] + "\n")
297
298 # =============== Update param based on MSP metadata ======================
299 # Replace param details with details from MSP if required
300 if 'precursor_type' in meta_info and meta_info['precursor_type'] in adduct_types:
301
302 nm = float(meta_info['precursor_mz']) + adduct_types[meta_info['precursor_type']]
303 paramd["PrecursorIonMode"] = int(round(adduct_types[meta_info['precursor_type']], 0))
304 else:
305
306 paramd["PrecursorIonMode"] = paramd['PrecursorIonModeDefault']
307 nm = float(meta_info['precursor_mz']) + paramd['nm_mass_diff_default']
308
309 paramd["NeutralPrecursorMass"] = nm
310
311 # =============== Create CLI cmd for metfrag ===============================
312 cmd = "metfrag"
313 for k, v in six.iteritems(paramd):
314 if not k in ['PrecursorIonModeDefault', 'nm_mass_diff_default', 'additional_details']:
315 cmd += " {}={}".format(str(k), str(v))
316 paramds[paramd["SampleName"]] = paramd 339 paramds[paramd["SampleName"]] = paramd
317 340 cmds.append(cmd)
318 # =============== Run metfrag ==============================================
319 # Filter before process with a minimum number of MS/MS peaks
320 if linesread >= float(args.minMSMSpeaks):
321
322 if int(args.cores_top_level) > 1:
323 cmds.append(cmd)
324 else:
325 print(cmd)
326 os.system(cmd)
327 341
328 meta_info = {} 342 meta_info = {}
329 343 pnumlines = 0
330 344 plinesread = 0
331 def work(cmds): 345
332 return [os.system(cmd) for cmd in cmds] 346 # end of file. Check if there is a MSP spectra to run metfrag on still
347 if plinesread and plinesread == pnumlines:
348
349 paramd, cmd = run_metfrag(meta_info, peaklist, args, wd, spectrac+1, adduct_types)
350
351 paramds[paramd["SampleName"]] = paramd
352 cmds.append(cmd)
353
354
355
333 356
334 357
335 # Perform multiprocessing on command line call level 358 # Perform multiprocessing on command line call level
336 if int(args.cores_top_level) > 1: 359 if int(args.cores_top_level) > 1:
337 cmds_chunks = [cmds[x:x + int(args.chunks)] for x in list(range(0, len(cmds), int(args.chunks)))] 360 cmds_chunks = [cmds[x:x + int(args.chunks)] for x in list(range(0, len(cmds), int(args.chunks)))]
369 additional_detail_headers = ['sample_name'] 392 additional_detail_headers = ['sample_name']
370 for k, paramd in six.iteritems(paramds): 393 for k, paramd in six.iteritems(paramds):
371 additional_detail_headers = list(set(additional_detail_headers + list(paramd['additional_details'].keys()))) 394 additional_detail_headers = list(set(additional_detail_headers + list(paramd['additional_details'].keys())))
372 395
373 headers = additional_detail_headers + sorted(list(set(headers))) 396 headers = additional_detail_headers + sorted(list(set(headers)))
397
398
399 # Sort files nicely
400 outfiles.sort(key = lambda s: int(re.match('^.*/(\d+)_metfrag_result.csv', s).group(1)))
401
402 print(outfiles)
403
374 404
375 # merge outputs 405 # merge outputs
376 with open(args.result_pth, 'a') as merged_outfile: 406 with open(args.result_pth, 'a') as merged_outfile:
377 dwriter = csv.DictWriter(merged_outfile, fieldnames=headers, delimiter='\t', quotechar='"', 407 dwriter = csv.DictWriter(merged_outfile, fieldnames=headers, delimiter='\t', quotechar='"',
378 quoting=csv.QUOTE_NONNUMERIC,) 408 quoting=csv.QUOTE_NONNUMERIC,)