comparison metfrag.py @ 1:c1b168770b68 draft

planemo upload for repository https://github.com/computational-metabolomics/metfrag-galaxy commit 5b384bd55e5bb4b0dc1daebd2ef5d3ee0e379b2e-dirty
author tomnl
date Fri, 31 May 2019 04:41:09 -0400
parents 75c805123b45
children d040e27b6225
comparison
equal deleted inserted replaced
0:75c805123b45 1:c1b168770b68
1 from __future__ import absolute_import, print_function
1 import argparse 2 import argparse
2 import csv 3 import csv
3 import os 4 import os
4 import sys 5 import sys
6 import six
7 import re
8 import random
9 import string
10 import shutil
11 import glob
12 import tempfile
13 import multiprocessing
14
15 from subprocess import call
16 from collections import defaultdict
17
print(sys.version)

# ---------------------------------------------------------------------
# Command line interface
# ---------------------------------------------------------------------
parser = argparse.ArgumentParser()

# Input MSP/MassBank file and merged output location
parser.add_argument('--input_pth')
parser.add_argument('--result_pth', default='metfrag_result.csv')

# Working directory, acquisition polarity and minimum MS/MS peak filter
parser.add_argument('--temp_dir')
parser.add_argument('--polarity', default='pos')
parser.add_argument('--minMSMSpeaks', default=1)

# Candidate database selection
parser.add_argument('--MetFragDatabaseType', default='PubChem')
parser.add_argument('--LocalDatabasePath', default='')
parser.add_argument('--LocalMetChemDatabaseServerIp', default='')

# MetFrag matching tolerances and threading
parser.add_argument('--DatabaseSearchRelativeMassDeviation', default=5)
parser.add_argument('--FragmentPeakMatchRelativeMassDeviation', default=10)
parser.add_argument('--FragmentPeakMatchAbsoluteMassDeviation', default=0.001)
parser.add_argument('--NumberThreads', default=1)
parser.add_argument('--UnconnectedCompoundFilter', action='store_true')
parser.add_argument('--IsotopeFilter', action='store_true')

# Candidate pre-processing filters (empty string == filter disabled)
parser.add_argument('--FilterMinimumElements', default='')
parser.add_argument('--FilterMaximumElements', default='')
parser.add_argument('--FilterSmartsInclusionList', default='')
parser.add_argument('--FilterSmartsExclusionList', default='')
parser.add_argument('--FilterIncludedElements', default='')
parser.add_argument('--FilterExcludedElements', default='')
parser.add_argument('--FilterIncludedExclusiveElements', default='')

# Post-processing thresholds and top-level parallelisation
parser.add_argument('--score_thrshld', default=0)
parser.add_argument('--pctexplpeak_thrshld', default=0)
parser.add_argument('--schema')
parser.add_argument('--cores_top_level', default=1)
parser.add_argument('--chunks', default=1)
parser.add_argument('--meta_select_col', default='name')

# Scoring configuration passed through to MetFrag
parser.add_argument('--ScoreSuspectLists', default='')
parser.add_argument('--MetFragScoreTypes', default="FragmenterScore,OfflineMetFusionScore")
parser.add_argument('--MetFragScoreWeights', default="1.0,1.0")

args = parser.parse_args()
print(args)
20 60
# Create the temporary working directory used for per-spectrum peak lists
# and MetFrag result CSVs.
if args.temp_dir:
    wd = args.temp_dir
else:
    wd = tempfile.mkdtemp()

# Always start from an empty directory: mkdtemp() creates it, and a
# user-supplied path may already exist with stale contents.
if os.path.exists(wd):
    shutil.rmtree(wd)
os.makedirs(wd)
72
######################################################################
# Setup parameter dictionary
######################################################################
# NOTE(review): the original used defaultdict() with no default factory,
# which behaves exactly like a plain dict; a dict is clearer.
paramd = {}

paramd["MetFragDatabaseType"] = args.MetFragDatabaseType

# Database-specific connection settings
if args.MetFragDatabaseType == "LocalCSV":
    paramd["LocalDatabasePath"] = args.LocalDatabasePath
elif args.MetFragDatabaseType == "MetChem":
    paramd["LocalMetChemDatabase"] = "metchem"
    paramd["LocalMetChemDatabasePortNumber"] = 5432
    paramd["LocalMetChemDatabaseServerIp"] = args.LocalMetChemDatabaseServerIp
    paramd["LocalMetChemDatabaseUser"] = "metchemro"
    paramd["LocalMetChemDatabasePassword"] = "metchemro"

# Mass-matching tolerances
paramd["FragmentPeakMatchAbsoluteMassDeviation"] = args.FragmentPeakMatchAbsoluteMassDeviation
paramd["FragmentPeakMatchRelativeMassDeviation"] = args.FragmentPeakMatchRelativeMassDeviation
paramd["DatabaseSearchRelativeMassDeviation"] = args.DatabaseSearchRelativeMassDeviation
paramd["SampleName"] = ''
paramd["ResultsPath"] = os.path.join(wd)

# Polarity determines the default precursor ion mode and the mass
# difference used to derive the neutral mass from the precursor m/z.
if args.polarity == "pos":
    paramd["IsPositiveIonMode"] = True
    paramd["PrecursorIonModeDefault"] = "1"
    paramd["PrecursorIonMode"] = "1"
    paramd["nm_mass_diff_default"] = 1.007276
else:
    paramd["IsPositiveIonMode"] = False
    paramd["PrecursorIonModeDefault"] = "-1"
    paramd["PrecursorIonMode"] = "-1"
    paramd["nm_mass_diff_default"] = -1.007276

paramd["MetFragCandidateWriter"] = "CSV"
paramd["NumberThreads"] = args.NumberThreads

if args.ScoreSuspectLists:
    paramd["ScoreSuspectLists"] = args.ScoreSuspectLists

paramd["MetFragScoreTypes"] = args.MetFragScoreTypes
paramd["MetFragScoreWeights"] = args.MetFragScoreWeights

# ----------------- candidate pre-processing filters ------------------
dct_filter = {}
filterh = []

if args.UnconnectedCompoundFilter:
    filterh.append('UnconnectedCompoundFilter')

if args.IsotopeFilter:
    filterh.append('IsotopeFilter')

if args.FilterMinimumElements:
    filterh.append('MinimumElementsFilter')
    dct_filter['FilterMinimumElements'] = args.FilterMinimumElements

if args.FilterMaximumElements:
    filterh.append('MaximumElementsFilter')
    dct_filter['FilterMaximumElements'] = args.FilterMaximumElements

if args.FilterSmartsInclusionList:
    filterh.append('SmartsSubstructureInclusionFilter')
    dct_filter['FilterSmartsInclusionList'] = args.FilterSmartsInclusionList

if args.FilterSmartsExclusionList:
    filterh.append('SmartsSubstructureExclusionFilter')
    dct_filter['FilterSmartsExclusionList'] = args.FilterSmartsExclusionList

# Both 'ElementInclusionExclusiveFilter' and 'ElementInclusionFilter' use
# the 'FilterIncludedElements' key, so the latter overwrites the former
# when both are supplied (behaviour preserved from the original).
if args.FilterIncludedExclusiveElements:
    filterh.append('ElementInclusionExclusiveFilter')
    dct_filter['FilterIncludedElements'] = args.FilterIncludedExclusiveElements

if args.FilterIncludedElements:
    filterh.append('ElementInclusionFilter')
    dct_filter['FilterIncludedElements'] = args.FilterIncludedElements

if args.FilterExcludedElements:
    filterh.append('ElementExclusionFilter')
    dct_filter['FilterExcludedElements'] = args.FilterExcludedElements

# Build the space-separated filter command string MetFrag expects.
# .items() works on both Python 2 and 3, so six.iteritems is unnecessary.
if filterh:
    fcmds = ','.join(filterh) + ' '
    for k, v in dct_filter.items():
        fcmds += "{0}={1} ".format(str(k), str(v))
    paramd["MetFragPreProcessingCandidateFilter"] = fcmds

print(paramd)
162
######################################################################
# Setup regular expressions for MSP parsing dictionary
######################################################################
# Raw strings are used so that \s, \d and \$ are regex escapes rather than
# (invalid) string escape sequences, which warn on modern Python.
regex_msp = {}
regex_msp['name'] = [r'^Name(?:=|:)(.*)$']
regex_msp['polarity'] = [r'^ion.*mode(?:=|:)(.*)$', r'^ionization.*mode(?:=|:)(.*)$', r'^polarity(?:=|:)(.*)$']
regex_msp['precursor_mz'] = [r'^precursor.*m/z(?:=|:)\s*(\d*[.,]?\d*)$', r'^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$']
regex_msp['precursor_type'] = [r'^precursor.*type(?:=|:)(.*)$', r'^adduct(?:=|:)(.*)$', r'^ADDUCTIONNAME(?:=|:)(.*)$']
regex_msp['num_peaks'] = [r'^Num.*Peaks(?:=|:)\s*(\d*)$']
regex_msp['msp'] = [r'^Name(?:=|:)(.*)$']  # Flag for standard MSP format

regex_massbank = {}
regex_massbank['name'] = [r'^RECORD_TITLE:(.*)$']
regex_massbank['polarity'] = [r'^AC\$MASS_SPECTROMETRY:\s+ION_MODE\s+(.*)$']
regex_massbank['precursor_mz'] = [r'^MS\$FOCUSED_ION:\s+PRECURSOR_M/Z\s+(\d*[.,]?\d*)$']
regex_massbank['precursor_type'] = [r'^MS\$FOCUSED_ION:\s+PRECURSOR_TYPE\s+(.*)$']
regex_massbank['num_peaks'] = [r'^PK\$NUM_PEAK:\s+(\d*)']
regex_massbank['cols'] = [r'^PK\$PEAK:\s+(.*)']
regex_massbank['massbank'] = [r'^RECORD_TITLE:(.*)$']  # Flag for massbank format
182
# Select the metadata regexes according to the declared schema.
if args.schema == 'msp':
    meta_regex = regex_msp
elif args.schema == 'massbank':
    meta_regex = regex_massbank
elif args.schema == 'auto':
    # If auto we just check for all the available parameter names and then
    # determine if MassBank or MSP based on the name parameter.
    # BUG FIX: the original did meta_regex.update(regex_massbank) and then
    # extend()ed the pattern lists, mutating the shared regex_massbank
    # definitions in place; copy the lists so the originals stay intact.
    meta_regex = {k: list(v) for k, v in regex_massbank.items()}
    meta_regex['name'].extend(regex_msp['name'])
    meta_regex['polarity'].extend(regex_msp['polarity'])
    meta_regex['precursor_mz'].extend(regex_msp['precursor_mz'])
    meta_regex['precursor_type'].extend(regex_msp['precursor_type'])
    meta_regex['num_peaks'].extend(regex_msp['num_peaks'])
    meta_regex['msp'] = list(regex_msp['msp'])

print(meta_regex)
200
201
202
# this dictionary will store the metadata results from the MSP file
meta_info = {}


def parse_meta(meta_regex, meta_info=None):
    """Extract metadata from the current ``line`` into *meta_info*.

    *meta_regex* maps a metadata key (e.g. 'name', 'precursor_mz') to a
    list of regex patterns; the first pattern that matches the module-level
    ``line`` stores its captured groups (joined with '-', stripped) under
    that key.  NOTE(review): reads ``line`` from module scope — consider
    passing it explicitly.

    BUG FIX: the original declared a mutable default argument
    (``meta_info={}``), which Python shares across calls; default to None
    and create a fresh dict per call instead.
    """
    if meta_info is None:
        meta_info = {}
    # .items() works on both Python 2 and 3; six.iteritems is unnecessary
    for k, regexes in meta_regex.items():
        for reg in regexes:
            m = re.search(reg, line, re.IGNORECASE)
            if m:
                meta_info[k] = '-'.join(m.groups()).strip()
    return meta_info
215
216
# Mass shift (Da) contributed by each supported adduct, keyed by the
# precursor-type string found in the MSP/MassBank record.
adduct_types = {
    '[M+H]+': 1.007276,
    '[M+NH4]+': 18.034374,
    '[M+Na]+': 22.989218,
    '[M+K]+': 38.963158,
    '[M+CH3OH+H]+': 33.033489,
    '[M+ACN+H]+': 42.033823,
    '[M+ACN+Na]+': 64.015765,
    '[M+2ACN+H]+': 83.06037,
    '[M-H]-': -1.007276,
    '[M+Cl]-': 34.969402,
}
229
######################################################################
# Parse MSP file and run metfrag CLI
######################################################################
# keep list of commands if performing in CLI in parallel
cmds = []
# keep a dictionary of all params (one snapshot per spectrum)
paramds = {}
# keep count of spectra (for uid)
spectrac = 0

with open(args.input_pth, "r") as infile:
    numlines = 0
    for line in infile:
        line = line.strip()
        print(line)
        if numlines == 0:
            # =============== Extract metadata from MSP =====================
            meta_info = parse_meta(meta_regex, meta_info)
            print(meta_info)
            if ('massbank' in meta_info and 'cols' in meta_info) or \
                    ('msp' in meta_info and 'num_peaks' in meta_info):
                print('check')
                numlines = int(meta_info['num_peaks'])
                linesread = 0
                peaklist = []

        elif linesread < numlines:
            # =============== Extract peaks from MSP ========================
            line = tuple(line.split())  # .split() splits on any whitespace
            # Keep only m/z and intensity, not relative intensity
            save_line = tuple(line[0].split() + line[1].split())
            linesread += 1
            peaklist.append(save_line)

        elif linesread == numlines:
            # =============== Sample name / additional details for output ===
            spectrac += 1

            # Choose the additional output columns: the record name, the
            # name split on "|" then ":" (msPurity style), all extracted
            # metadata, or just the spectrum index.
            if args.meta_select_col == 'name':
                paramd['additional_details'] = {'name': meta_info['name']}
            elif args.meta_select_col == 'name_split':
                sampled = {sm.split(":")[0].strip(): sm.split(":")[1].strip()
                           for sm in meta_info['name'].split("|")}
                # BUG FIX: the original built 'sampled' but never assigned it
                paramd['additional_details'] = sampled
            elif args.meta_select_col == 'all':
                paramd['additional_details'] = meta_info
            else:
                paramd['additional_details'] = {'spectra_idx': spectrac}

            paramd["SampleName"] = "{}_metfrag_result".format(spectrac)

            # =============== Output peaks to txt file ======================
            numlines = 0
            paramd["PeakListPath"] = os.path.join(wd, "{}_tmpspec.txt".format(spectrac))
            print(paramd["PeakListPath"])
            with open(paramd["PeakListPath"], 'w') as outfile:
                for p in peaklist:
                    outfile.write(p[0] + "\t" + p[1] + "\n")

            # =============== Update param based on MSP metadata ============
            # Neutral mass = precursor m/z MINUS the adduct mass shift.
            # BUG FIX: the refactor ADDED the shift, inverting the sign
            # relative to the previous behaviour (mz - 1.007276 for pos).
            if 'precursor_type' in meta_info and meta_info['precursor_type'] in adduct_types:
                nm = float(meta_info['precursor_mz']) - adduct_types[meta_info['precursor_type']]
                paramd["PrecursorIonMode"] = int(round(adduct_types[meta_info['precursor_type']], 0))
            else:
                paramd["PrecursorIonMode"] = paramd['PrecursorIonModeDefault']
                nm = float(meta_info['precursor_mz']) - paramd['nm_mass_diff_default']

            paramd["NeutralPrecursorMass"] = nm

            # =============== Create CLI cmd for metfrag ====================
            cmd = "metfrag"
            for k, v in paramd.items():
                if k not in ('PrecursorIonModeDefault', 'nm_mass_diff_default', 'additional_details'):
                    cmd += " {}={}".format(str(k), str(v))
            # BUG FIX: snapshot the params; storing the shared paramd
            # reference made every entry alias the last spectrum's details
            paramds[paramd["SampleName"]] = dict(paramd)

            # =============== Run metfrag ===================================
            # Filter before processing with a minimum number of MS/MS peaks
            if linesread >= float(args.minMSMSpeaks):
                if int(args.cores_top_level) > 1:
                    cmds.append(cmd)
                else:
                    print(cmd)
                    os.system(cmd)

            meta_info = {}
def work(cmds):
    """Run each shell command in *cmds* sequentially; return the exit statuses."""
    statuses = []
    for cmd in cmds:
        statuses.append(os.system(cmd))
    return statuses
# Perform multiprocessing at the command-line-call level: split the queued
# metfrag commands into chunks and run the chunks across a process pool.
if int(args.cores_top_level) > 1:
    chunk_size = int(args.chunks)
    cmds_chunks = [cmds[i:i + chunk_size] for i in range(0, len(cmds), chunk_size)]
    pool = multiprocessing.Pool(processes=int(args.cores_top_level))
    pool.map(work, cmds_chunks)
    pool.close()
    pool.join()
######################################################################
# Concatenate and filter the output
######################################################################
# Outputs might have different headers, so gather the union of all headers
# before merging the files.
outfiles = glob.glob(os.path.join(wd, "*_metfrag_result.csv"))

headers = []
c = 0  # count of data rows seen across all result files
for fn in outfiles:
    with open(fn, 'r') as infile:
        reader = csv.reader(infile)
        # next() works on both Python 2 and 3 csv readers, so no version
        # branch is needed; the [] default guards against an empty file
        # (the original raised StopIteration there).
        headers.extend(next(reader, []))
        # check if the file has any data rows (peek at up to two rows)
        for i, row in enumerate(reader):
            c += 1
            if i == 1:
                break

# if no data rows (e.g. no matches) then do not save an output and exit
if c == 0:
    sys.exit()
97 368
# Collect the extra per-spectrum columns: sample_name plus whatever
# additional_details were stored for each spectrum.
_extra_cols = {'sample_name'}
for paramd in paramds.values():
    _extra_cols.update(paramd['additional_details'].keys())
# Sort for a deterministic column order (the original concatenated unsorted
# sets, so the extra columns came out in arbitrary order), and avoid the
# quadratic list(set(...)) rebuild on every iteration.
additional_detail_headers = sorted(_extra_cols)

headers = additional_detail_headers + sorted(set(headers))
# merge outputs
with open(args.result_pth, 'a') as merged_outfile:
    dwriter = csv.DictWriter(merged_outfile, fieldnames=headers, delimiter='\t', quotechar='"',
                             quoting=csv.QUOTE_NONNUMERIC)
    dwriter.writeheader()

    for fn in outfiles:
        # e.g. "3_metfrag_result" — the SampleName key used in paramds
        bfn = os.path.basename(fn).replace(".csv", "")
        with open(fn) as infile:
            reader = csv.DictReader(infile, delimiter=',', quotechar='"')
            for line in reader:
                bewrite = True

                # Filter when no MS/MS peak matched
                if float(args.pctexplpeak_thrshld) > 0 and "NA" in line.get("ExplPeaks", ""):
                    bewrite = False

                # Filter with a score threshold
                if "Score" in line and float(line["Score"]) <= float(args.score_thrshld):
                    bewrite = False

                # Filter on the relative number of peaks matched.
                # BUG FIX: the original computed this inside the per-column
                # loop, where NoExplPeaks/NumberPeaksUsed could be unset (a
                # NameError) or stale from a previous row depending on the
                # column order; compute it once per row instead.
                try:
                    pctexplpeak = float(line["NoExplPeaks"]) / float(line["NumberPeaksUsed"]) * 100
                except (KeyError, ValueError, ZeroDivisionError):
                    bewrite = False
                else:
                    if pctexplpeak < float(args.pctexplpeak_thrshld):
                        bewrite = False

                # Write the line if it passes all filters
                if bewrite:
                    ad = paramds[bfn]['additional_details']
                    line.update(ad)
                    line['sample_name'] = paramds[bfn]['SampleName']
                    dwriter.writerow(line)