comparison metfrag.py @ 9:5763234618d4 draft default tip

"planemo upload for repository https://github.com/computational-metabolomics/metfrag-galaxy commit 6bc059c100f1f968d99c1d5ad5e7259e83d386b6"
author tomnl
date Fri, 04 Oct 2019 07:15:52 -0400
parents 9a3019c609d9
children
--- metfrag.py (revision 8:9a3019c609d9)
+++ metfrag.py (revision 9:5763234618d4)
@@ -1,20 +1,20 @@
 from __future__ import absolute_import, print_function
+
+import ConfigParser
 import argparse
 import csv
+import glob
+import multiprocessing
 import os
+import re
+import shutil
 import sys
+import tempfile
+from collections import defaultdict
+
 import six
-import re
-import ConfigParser
-import shutil
-import glob
-import tempfile
-import multiprocessing
-
-from subprocess import call
-from collections import defaultdict
 
 print(sys.version)
 
 parser = argparse.ArgumentParser()
 parser.add_argument('--input_pth')
@@ -47,29 +47,27 @@
 parser.add_argument('--pctexplpeak_thrshld', default=0)
 parser.add_argument('--schema')
 parser.add_argument('--cores_top_level', default=1)
 parser.add_argument('--chunks', default=1)
 parser.add_argument('--meta_select_col', default='name')
 parser.add_argument('--skip_invalid_adducts', action='store_true')
 
 parser.add_argument('--ScoreSuspectLists', default='')
-parser.add_argument('--MetFragScoreTypes', default="FragmenterScore,OfflineMetFusionScore")
+parser.add_argument('--MetFragScoreTypes',
+                    default="FragmenterScore,OfflineMetFusionScore")
 parser.add_argument('--MetFragScoreWeights', default="1.0,1.0")
-
 
 args = parser.parse_args()
 print(args)
 
-
 config = ConfigParser.ConfigParser()
-config.read(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.ini'))
-
+config.read(
+    os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.ini'))
 
 if os.stat(args.input_pth).st_size == 0:
     print('Input file empty')
     exit()
-
 
 # Create temporary working directory
 if args.temp_dir:
     wd = args.temp_dir
 else:
@@ -83,43 +81,51 @@
 
 ######################################################################
 # Setup regular expressions for MSP parsing dictionary
 ######################################################################
 regex_msp = {}
-regex_msp['name'] = ['^Name(?:=|:)(.*)$']
-regex_msp['polarity'] = ['^ion.*mode(?:=|:)(.*)$', '^ionization.*mode(?:=|:)(.*)$', '^polarity(?:=|:)(.*)$']
-regex_msp['precursor_mz'] = ['^precursor.*m/z(?:=|:)\s*(\d*[.,]?\d*)$', '^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$']
-regex_msp['precursor_type'] = ['^precursor.*type(?:=|:)(.*)$', '^adduct(?:=|:)(.*)$', '^ADDUCTIONNAME(?:=|:)(.*)$']
-regex_msp['num_peaks'] = ['^Num.*Peaks(?:=|:)\s*(\d*)$']
-regex_msp['msp'] = ['^Name(?:=|:)(.*)$'] # Flag for standard MSP format
+regex_msp['name'] = [r'^Name(?:=|:)(.*)$']
+regex_msp['polarity'] = [r'^ion.*mode(?:=|:)(.*)$',
+                         r'^ionization.*mode(?:=|:)(.*)$',
+                         r'^polarity(?:=|:)(.*)$']
+regex_msp['precursor_mz'] = [r'^precursor.*m/z(?:=|:)\s*(\d*[.,]?\d*)$',
+                             r'^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$']
+regex_msp['precursor_type'] = [r'^precursor.*type(?:=|:)(.*)$',
+                               r'^adduct(?:=|:)(.*)$',
+                               r'^ADDUCTIONNAME(?:=|:)(.*)$']
+regex_msp['num_peaks'] = [r'^Num.*Peaks(?:=|:)\s*(\d*)$']
+regex_msp['msp'] = [r'^Name(?:=|:)(.*)$'] # Flag for standard MSP format
 
 regex_massbank = {}
-regex_massbank['name'] = ['^RECORD_TITLE:(.*)$']
-regex_massbank['polarity'] = ['^AC\$MASS_SPECTROMETRY:\s+ION_MODE\s+(.*)$']
-regex_massbank['precursor_mz'] = ['^MS\$FOCUSED_ION:\s+PRECURSOR_M/Z\s+(\d*[.,]?\d*)$']
-regex_massbank['precursor_type'] = ['^MS\$FOCUSED_ION:\s+PRECURSOR_TYPE\s+(.*)$']
-regex_massbank['num_peaks'] = ['^PK\$NUM_PEAK:\s+(\d*)']
-regex_massbank['cols'] = ['^PK\$PEAK:\s+(.*)']
-regex_massbank['massbank'] = ['^RECORD_TITLE:(.*)$'] # Flag for massbank format
+regex_massbank['name'] = [r'^RECORD_TITLE:(.*)$']
+regex_massbank['polarity'] = [r'^AC\$MASS_SPECTROMETRY:\s+ION_MODE\s+(.*)$']
+regex_massbank['precursor_mz'] = [
+    r'^MS\$FOCUSED_ION:\s+PRECURSOR_M/Z\s+(\d*[.,]?\d*)$']
+regex_massbank['precursor_type'] = [
+    r'^MS\$FOCUSED_ION:\s+PRECURSOR_TYPE\s+(.*)$']
+regex_massbank['num_peaks'] = [r'^PK\$NUM_PEAK:\s+(\d*)']
+regex_massbank['cols'] = [r'^PK\$PEAK:\s+(.*)']
+regex_massbank['massbank'] = [
+    r'^RECORD_TITLE:(.*)$'] # Flag for massbank format
 
 if args.schema == 'msp':
     meta_regex = regex_msp
 elif args.schema == 'massbank':
     meta_regex = regex_massbank
 elif args.schema == 'auto':
-    # If auto we just check for all the available paramter names and then determine if Massbank or MSP based on
-    # the name parameter
+    # If auto we just check for all the available paramter names and then
+    # determine if Massbank or MSP based on the name parameter
    meta_regex = {}
     meta_regex.update(regex_massbank)
     meta_regex['name'].extend(regex_msp['name'])
     meta_regex['polarity'].extend(regex_msp['polarity'])
     meta_regex['precursor_mz'].extend(regex_msp['precursor_mz'])
     meta_regex['precursor_type'].extend(regex_msp['precursor_type'])
     meta_regex['num_peaks'].extend(regex_msp['num_peaks'])
     meta_regex['msp'] = regex_msp['msp']
-
-
+else:
+    sys.exit("No schema selected")
 
 adduct_types = {
     '[M+H]+': 1.007276,
     '[M+NH4]+': 18.034374,
     '[M+Na]+': 22.989218,
@@ -129,27 +135,31 @@
     '[M+ACN+Na]+': 64.015765,
     '[M+2ACN+H]+': 83.06037,
     '[M-H]-': -1.007276,
     '[M+Cl]-': 34.969402,
     '[M+HCOO]-': 44.99819,
-    '[M-H+HCOOH]-': 44.99819, # same as above but different style of writing adduct
+    '[M-H+HCOOH]-': 44.99819,
+    # same as above but different style of writing adduct
     '[M+CH3COO]-': 59.01385,
-    '[M-H+CH3COOH]-': 59.01385 # same as above but different style of writing adduct
+    '[M-H+CH3COOH]-': 59.01385
+    # same as above but different style of writing adduct
 }
 inv_adduct_types = {int(round(v, 0)): k for k, v in adduct_types.iteritems()}
 
+
 # function to extract the meta data using the regular expressions
-def parse_meta(meta_regex, meta_info={}):
+def parse_meta(meta_regex, meta_info=None):
+    if meta_info is None:
+        meta_info = {}
     for k, regexes in six.iteritems(meta_regex):
         for reg in regexes:
             m = re.search(reg, line, re.IGNORECASE)
             if m:
                 meta_info[k] = '-'.join(m.groups()).strip()
     return meta_info
 
 
-
 ######################################################################
 # Setup parameter dictionary
 ######################################################################
 def init_paramd(args):
     paramd = defaultdict()
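For orientation, a minimal self-contained sketch (editorial, not part of this commit) of how patterns like the ones above pull metadata out of an MSP-style header block; the record text and field values are invented for illustration:

    import re

    # Hypothetical MSP header block mirroring the fields targeted above
    example_record = """Name: Glucose
    Precursor_type: [M+H]+
    Precursor_mz: 181.0707
    Ion_mode: Positive
    Num Peaks: 2"""

    patterns = {
        'name': [r'^Name(?:=|:)(.*)$'],
        'precursor_type': [r'^precursor.*type(?:=|:)(.*)$'],
        'precursor_mz': [r'^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$'],
        'polarity': [r'^ion.*mode(?:=|:)(.*)$'],
        'num_peaks': [r'^Num.*Peaks(?:=|:)\s*(\d*)$'],
    }

    meta_info = {}
    for line in example_record.splitlines():
        line = line.strip()
        for key, regexes in patterns.items():
            for reg in regexes:
                m = re.search(reg, line, re.IGNORECASE)
                if m:
                    meta_info[key] = '-'.join(m.groups()).strip()

    print(meta_info)
    # roughly: {'name': 'Glucose', 'precursor_type': '[M+H]+',
    #           'precursor_mz': '181.0707', 'polarity': 'Positive',
    #           'num_peaks': '2'}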
@@ -157,19 +167,27 @@
     paramd["MetFragDatabaseType"] = args.MetFragDatabaseType
 
     if args.MetFragDatabaseType == "LocalCSV":
         paramd["LocalDatabasePath"] = args.LocalDatabasePath
     elif args.MetFragDatabaseType == "MetChem":
-        paramd["LocalMetChemDatabase"] = config.get('MetChem', 'LocalMetChemDatabase')
-        paramd["LocalMetChemDatabasePortNumber"] = config.get('MetChem', 'LocalMetChemDatabasePortNumber')
-        paramd["LocalMetChemDatabaseServerIp"] = args.LocalMetChemDatabaseServerIp
-        paramd["LocalMetChemDatabaseUser"] = config.get('MetChem', 'LocalMetChemDatabaseUser')
-        paramd["LocalMetChemDatabasePassword"] = config.get('MetChem', 'LocalMetChemDatabasePassword')
-
-    paramd["FragmentPeakMatchAbsoluteMassDeviation"] = args.FragmentPeakMatchAbsoluteMassDeviation
-    paramd["FragmentPeakMatchRelativeMassDeviation"] = args.FragmentPeakMatchRelativeMassDeviation
-    paramd["DatabaseSearchRelativeMassDeviation"] = args.DatabaseSearchRelativeMassDeviation
+        paramd["LocalMetChemDatabase"] = \
+            config.get('MetChem', 'LocalMetChemDatabase')
+        paramd["LocalMetChemDatabasePortNumber"] = \
+            config.get('MetChem', 'LocalMetChemDatabasePortNumber')
+        paramd["LocalMetChemDatabaseServerIp"] = \
+            args.LocalMetChemDatabaseServerIp
+        paramd["LocalMetChemDatabaseUser"] = \
+            config.get('MetChem', 'LocalMetChemDatabaseUser')
+        paramd["LocalMetChemDatabasePassword"] = \
+            config.get('MetChem', 'LocalMetChemDatabasePassword')
+
+    paramd["FragmentPeakMatchAbsoluteMassDeviation"] = \
+        args.FragmentPeakMatchAbsoluteMassDeviation
+    paramd["FragmentPeakMatchRelativeMassDeviation"] = \
+        args.FragmentPeakMatchRelativeMassDeviation
+    paramd["DatabaseSearchRelativeMassDeviation"] = \
+        args.DatabaseSearchRelativeMassDeviation
     paramd["SampleName"] = ''
     paramd["ResultsPath"] = os.path.join(wd)
 
     if args.polarity == "pos":
         paramd["IsPositiveIonMode"] = True
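The MetChem branch above pulls its connection settings from the config.ini read next to the script. A hypothetical Python 2 snippet showing the section and option names the code asks for (all values are placeholders, not a real setup):

    import ConfigParser  # Python 2 module, matching the script

    # config.ini is expected to contain something like (placeholder values):
    #
    #   [MetChem]
    #   LocalMetChemDatabase = metchem
    #   LocalMetChemDatabasePortNumber = 5432
    #   LocalMetChemDatabaseUser = metchem_user
    #   LocalMetChemDatabasePassword = changeme

    config = ConfigParser.ConfigParser()
    config.read('config.ini')
    print(config.get('MetChem', 'LocalMetChemDatabase'))  # -> metchem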
@@ -208,21 +226,24 @@
         filterh.append('MaximumElementsFilter')
         dct_filter['FilterMaximumElements'] = args.FilterMaximumElements
 
     if args.FilterSmartsInclusionList:
         filterh.append('SmartsSubstructureInclusionFilter')
-        dct_filter['FilterSmartsInclusionList'] = args.FilterSmartsInclusionList
+        dct_filter[
+            'FilterSmartsInclusionList'] = args.FilterSmartsInclusionList
 
     if args.FilterSmartsExclusionList:
         filterh.append('SmartsSubstructureExclusionFilter')
-        dct_filter['FilterSmartsExclusionList'] = args.FilterSmartsExclusionList
-
-    # My understanding is that both 'ElementInclusionExclusiveFilter' and 'ElementExclusionFilter' use
-    # 'FilterIncludedElements'
+        dct_filter[
+            'FilterSmartsExclusionList'] = args.FilterSmartsExclusionList
+
+    # My understanding is that both 'ElementInclusionExclusiveFilter'
+    # and 'ElementExclusionFilter' use 'FilterIncludedElements'
     if args.FilterIncludedExclusiveElements:
         filterh.append('ElementInclusionExclusiveFilter')
-        dct_filter['FilterIncludedElements'] = args.FilterIncludedExclusiveElements
+        dct_filter[
+            'FilterIncludedElements'] = args.FilterIncludedExclusiveElements
 
     if args.FilterIncludedElements:
         filterh.append('ElementInclusionFilter')
         dct_filter['FilterIncludedElements'] = args.FilterIncludedElements
 
@@ -242,48 +263,55 @@
 
 ######################################################################
 # Function to run metfrag when all metainfo and peaks have been parsed
 ######################################################################
 def run_metfrag(meta_info, peaklist, args, wd, spectrac, adduct_types):
-    # Get sample details (if possible to extract) e.g. if created as part of the msPurity pipeline)
-    # choose between getting additional details to add as columns as either all meta data from msp, just
-    # details from the record name (i.e. when using msPurity and we have the columns coded into the name) or
-    # just the spectra index (spectrac)]
+    # Get sample details (if possible to extract) e.g. if created as part of
+    # the msPurity pipeline) choose between getting additional details to add
+    # as columns as either all meta data from msp, just details from the
+    # record name (i.e. when using msPurity and we have the columns coded into
+    # the name) or just the spectra index (spectrac)].
     # Returns the parameters used and the command line call
 
     paramd = init_paramd(args)
     if args.meta_select_col == 'name':
         # have additional column of just the name
         paramd['additional_details'] = {'name': meta_info['name']}
     elif args.meta_select_col == 'name_split':
-        # have additional columns split by "|" and then on ":" e.g. MZ:100.2 | RT:20 | xcms_grp_id:1
-        paramd['additional_details'] = {sm.split(":")[0].strip(): sm.split(":")[1].strip() for sm in
-                                        meta_info['name'].split("|")}
+        # have additional columns split by "|" and
+        # then on ":" e.g. MZ:100.2 | RT:20 | xcms_grp_id:1
+        paramd['additional_details'] = {
+            sm.split(":")[0].strip(): sm.split(":")[1].strip() for sm in
+            meta_info['name'].split("|")}
     elif args.meta_select_col == 'all':
-        # have additional columns based on all the meta information extracted from the MSP
+        # have additional columns based on all the meta information
+        # extracted from the MSP
         paramd['additional_details'] = meta_info
     else:
         # Just have and index of the spectra in the MSP file
         paramd['additional_details'] = {'spectra_idx': spectrac}
 
-
     paramd["SampleName"] = "{}_metfrag_result".format(spectrac)
 
     # =============== Output peaks to txt file ==============================
-    paramd["PeakListPath"] = os.path.join(wd, "{}_tmpspec.txt".format(spectrac))
+    paramd["PeakListPath"] = os.path.join(wd,
+                                          "{}_tmpspec.txt".format(spectrac))
 
     # write spec file
     with open(paramd["PeakListPath"], 'w') as outfile:
         for p in peaklist:
             outfile.write(p[0] + "\t" + p[1] + "\n")
 
     # =============== Update param based on MSP metadata ======================
     # Replace param details with details from MSP if required
-    if 'precursor_type' in meta_info and meta_info['precursor_type'] in adduct_types:
+    if 'precursor_type' in meta_info and \
+            meta_info['precursor_type'] in adduct_types:
         adduct = meta_info['precursor_type']
-        nm = float(meta_info['precursor_mz']) - adduct_types[meta_info['precursor_type']]
-        paramd["PrecursorIonMode"] = int(round(adduct_types[meta_info['precursor_type']], 0))
+        nm = float(meta_info['precursor_mz']) - adduct_types[
+            meta_info['precursor_type']]
+        paramd["PrecursorIonMode"] = \
+            int(round(adduct_types[meta_info['precursor_type']], 0))
     elif not args.skip_invalid_adducts:
         adduct = inv_adduct_types[int(paramd['PrecursorIonModeDefault'])]
         paramd["PrecursorIonMode"] = paramd['PrecursorIonModeDefault']
         nm = float(meta_info['precursor_mz']) - paramd['nm_mass_diff_default']
     else:
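A worked example (editorial, not code from the commit) of the adduct arithmetic in the hunk above: the neutral mass is the precursor m/z minus the adduct mass from adduct_types, and PrecursorIonMode is that adduct mass rounded to the nearest integer.

    adduct_types = {'[M+H]+': 1.007276, '[M-H]-': -1.007276}

    precursor_mz = 181.070665  # hypothetical [M+H]+ precursor m/z
    adduct = '[M+H]+'

    nm = precursor_mz - adduct_types[adduct]
    precursor_ion_mode = int(round(adduct_types[adduct], 0))

    print(nm)                  # ~180.063389 -> NeutralPrecursorMass
    print(precursor_ion_mode)  # 1 -> PrecursorIonMode (would be -1 for [M-H]-)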
@@ -291,32 +319,30 @@
         return '', ''
 
     paramd['additional_details']['adduct'] = adduct
     paramd["NeutralPrecursorMass"] = nm
 
-    # =============== Create CLI cmd for metfrag ===============================
+    # ============== Create CLI cmd for metfrag ===============================
     cmd = "metfrag"
     for k, v in six.iteritems(paramd):
-        if not k in ['PrecursorIonModeDefault', 'nm_mass_diff_default', 'additional_details']:
+        if k not in ['PrecursorIonModeDefault', 'nm_mass_diff_default',
+                     'additional_details']:
             cmd += " {}={}".format(str(k), str(v))
 
-    # =============== Run metfrag ==============================================
-    #print(cmd)
+    # ============== Run metfrag ==============================================
+    # print(cmd)
     # Filter before process with a minimum number of MS/MS peaks
     if plinesread >= float(args.minMSMSpeaks):
 
         if int(args.cores_top_level) == 1:
             os.system(cmd)
 
-
-
     return paramd, cmd
 
 
 def work(cmds):
     return [os.system(cmd) for cmd in cmds]
-
 
 
 ######################################################################
 # Parse MSP file and run metfrag CLI
 ######################################################################
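The loop in the hunk above simply serialises every paramd entry (except the bookkeeping keys) into key=value tokens appended after the metfrag executable name. A sketch of the kind of command line this yields, with purely illustrative parameter values and paths:

    paramd = {
        'MetFragDatabaseType': 'PubChem',
        'DatabaseSearchRelativeMassDeviation': 5,
        'FragmentPeakMatchAbsoluteMassDeviation': 0.001,
        'SampleName': '1_metfrag_result',
        'PeakListPath': '/tmp/metfrag_wd/1_tmpspec.txt',
        'NeutralPrecursorMass': 180.063389,
        'PrecursorIonMode': 1,
    }

    cmd = "metfrag"
    for k, v in paramd.items():
        cmd += " {}={}".format(k, v)

    print(cmd)
    # e.g. metfrag MetFragDatabaseType=PubChem
    #      DatabaseSearchRelativeMassDeviation=5 ... PrecursorIonMode=1
    #      (all on one line, then executed via os.system)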
@@ -336,31 +362,33 @@
     plinesread = 0
     for line in infile:
         line = line.strip()
 
         if pnumlines == 0:
-            # =============== Extract metadata from MSP ========================
+            # ============== Extract metadata from MSP ========================
             meta_info = parse_meta(meta_regex, meta_info)
 
-            if ('massbank' in meta_info and 'cols' in meta_info) or ('msp' in meta_info and 'num_peaks' in meta_info):
-
+            if ('massbank' in meta_info and 'cols' in meta_info) or (
+                    'msp' in meta_info and 'num_peaks' in meta_info):
                 pnumlines = int(meta_info['num_peaks'])
                 plinesread = 0
                 peaklist = []
 
         elif plinesread < pnumlines:
-            # =============== Extract peaks from MSP ===========================
-            line = tuple(line.split()) # .split() will split on any empty space (i.e. tab and space)
+            # ============== Extract peaks from MSP ===========================
+            # .split() will split on any empty space (i.e. tab and space)
+            line = tuple(line.split())
             # Keep only m/z and intensity, not relative intensity
             save_line = tuple(line[0].split() + line[1].split())
             plinesread += 1
             peaklist.append(save_line)
 
         elif plinesread and plinesread == pnumlines:
-            # =============== Get sample name and additional details for output ===
+            # ======= Get sample name and additional details for output =======
             spectrac += 1
-            paramd, cmd = run_metfrag(meta_info, peaklist, args, wd, spectrac, adduct_types)
+            paramd, cmd = run_metfrag(meta_info, peaklist, args, wd, spectrac,
+                                      adduct_types)
 
             if paramd:
                 paramds[paramd["SampleName"]] = paramd
                 cmds.append(cmd)
 
@@ -369,21 +397,20 @@
             plinesread = 0
 
     # end of file. Check if there is a MSP spectra to run metfrag on still
     if plinesread and plinesread == pnumlines:
 
-        paramd, cmd = run_metfrag(meta_info, peaklist, args, wd, spectrac+1, adduct_types)
+        paramd, cmd = run_metfrag(meta_info, peaklist, args, wd, spectrac + 1,
+                                  adduct_types)
 
         if paramd:
             paramds[paramd["SampleName"]] = paramd
             cmds.append(cmd)
 
-
-
-
 # Perform multiprocessing on command line call level
 if int(args.cores_top_level) > 1:
-    cmds_chunks = [cmds[x:x + int(args.chunks)] for x in list(range(0, len(cmds), int(args.chunks)))]
+    cmds_chunks = [cmds[x:x + int(args.chunks)] for x in
+                   list(range(0, len(cmds), int(args.chunks)))]
     pool = multiprocessing.Pool(processes=int(args.cores_top_level))
     pool.map(work, cmds_chunks)
     pool.close()
     pool.join()
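A quick illustration (editorial, with made-up command strings) of how the chunking above groups the queued calls before multiprocessing.Pool hands each group to work(), which runs its commands sequentially:

    cmds = ['metfrag ... 1', 'metfrag ... 2', 'metfrag ... 3',
            'metfrag ... 4', 'metfrag ... 5']
    chunks = 2

    cmds_chunks = [cmds[x:x + chunks] for x in range(0, len(cmds), chunks)]
    print(cmds_chunks)
    # [['metfrag ... 1', 'metfrag ... 2'],
    #  ['metfrag ... 3', 'metfrag ... 4'],
    #  ['metfrag ... 5']]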
@@ -390,11 +417,13 @@
 
 ######################################################################
 # Concatenate and filter the output
 ######################################################################
-# outputs might have different headers. Need to get a list of all the headers before we start merging the files
-# outfiles = [os.path.join(wd, f) for f in glob.glob(os.path.join(wd, "*_metfrag_result.csv"))]
+# outputs might have different headers. Need to get a list of all the
+# headers before we start merging the files
+# outfiles = [os.path.join(wd, f) for f in glob.glob(os.path.join(wd,
+#                                                    "*_metfrag_result.csv"))]
 outfiles = glob.glob(os.path.join(wd, "*_metfrag_result.csv"))
 
 if len(outfiles) == 0:
     print('No results')
     sys.exit()
@@ -406,42 +435,43 @@
         reader = csv.reader(infile)
         if sys.version_info >= (3, 0):
             headers.extend(next(reader))
         else:
             headers.extend(reader.next())
         # check if file has any data rows
         for i, row in enumerate(reader):
             c += 1
             if i == 1:
                 break
 
-# if no data rows (e.g. matches) then do not save an output and leave the program
+# if no data rows (e.g. matches) then do not save an
+# output and leave the program
 if c == 0:
     print('No results')
     sys.exit()
 
 additional_detail_headers = ['sample_name']
 for k, paramd in six.iteritems(paramds):
-    additional_detail_headers = list(set(additional_detail_headers + list(paramd['additional_details'].keys())))
+    additional_detail_headers = list(set(
+        additional_detail_headers + list(paramd['additional_details'].keys())))
 
 # add inchikey if not already present (missing in metchem output)
 if 'InChIKey' not in headers:
     headers.append('InChIKey')
 
 headers = additional_detail_headers + sorted(list(set(headers)))
 
-
-
 # Sort files nicely
-outfiles.sort(key = lambda s: int(re.match('^.*/(\d+)_metfrag_result.csv', s).group(1)))
+outfiles.sort(
+    key=lambda s: int(re.match(r'^.*/(\d+)_metfrag_result.csv', s).group(1)))
 
 print(outfiles)
-
 
 # merge outputs
 with open(args.result_pth, 'a') as merged_outfile:
-    dwriter = csv.DictWriter(merged_outfile, fieldnames=headers, delimiter='\t', quotechar='"')
+    dwriter = csv.DictWriter(merged_outfile, fieldnames=headers,
+                             delimiter='\t', quotechar='"')
     dwriter.writeheader()
 
     for fn in outfiles:
 
         with open(fn) as infile:
@@ -449,11 +479,12 @@
             for line in reader:
                 bewrite = True
                 for key, value in line.items():
                     # Filter when no MS/MS peak matched
                     if key == "ExplPeaks":
-                        if float(args.pctexplpeak_thrshld) > 0 and "NA" in value:
+                        if float(args.pctexplpeak_thrshld) > 0 and \
+                                "NA" in value:
                             bewrite = False
                     # Filter with a score threshold
                     elif key == "Score":
                         if float(value) <= float(args.score_thrshld):
                             bewrite = False
@@ -475,12 +506,15 @@
                 bfn = os.path.basename(fn)
                 bfn = bfn.replace(".csv", "")
                 line['sample_name'] = paramds[bfn]['SampleName']
                 ad = paramds[bfn]['additional_details']
 
                 if args.MetFragDatabaseType == "MetChem":
-                    # for some reason the metchem database option does not report the full inchikey (at least
-                    # in the Bham setup. This ensures we always get the fully inchikey
-                    line['InChIKey'] = '{}-{}-{}'.format(line['InChIKey1'], line['InChIKey2'], line['InChIKey3'])
+                    # for some reason the metchem database option does
+                    # not report the full inchikey (at least in the Bham
+                    # setup. This ensures we always get the fully inchikey
+                    line['InChIKey'] = '{}-{}-{}'.format(line['InChIKey1'],
+                                                         line['InChIKey2'],
+                                                         line['InChIKey3'])
 
                 line.update(ad)
                 dwriter.writerow(line)