Mercurial > repos > tomnl > metfrag
comparison metfrag.py @ 9:5763234618d4 draft default tip
"planemo upload for repository https://github.com/computational-metabolomics/metfrag-galaxy commit 6bc059c100f1f968d99c1d5ad5e7259e83d386b6"
| author | tomnl |
|---|---|
| date | Fri, 04 Oct 2019 07:15:52 -0400 |
| parents | 9a3019c609d9 |
| children |
comparison
equal
deleted
inserted
replaced
| 8:9a3019c609d9 | 9:5763234618d4 |
|---|---|
| 1 from __future__ import absolute_import, print_function | 1 from __future__ import absolute_import, print_function |
| 2 | |
| 3 import ConfigParser | |
| 2 import argparse | 4 import argparse |
| 3 import csv | 5 import csv |
| 6 import glob | |
| 7 import multiprocessing | |
| 4 import os | 8 import os |
| 9 import re | |
| 10 import shutil | |
| 5 import sys | 11 import sys |
| 12 import tempfile | |
| 13 from collections import defaultdict | |
| 14 | |
| 6 import six | 15 import six |
| 7 import re | |
| 8 import ConfigParser | |
| 9 import shutil | |
| 10 import glob | |
| 11 import tempfile | |
| 12 import multiprocessing | |
| 13 | |
| 14 from subprocess import call | |
| 15 from collections import defaultdict | |
| 16 | 16 |
| 17 print(sys.version) | 17 print(sys.version) |
| 18 | 18 |
| 19 parser = argparse.ArgumentParser() | 19 parser = argparse.ArgumentParser() |
| 20 parser.add_argument('--input_pth') | 20 parser.add_argument('--input_pth') |
| 47 parser.add_argument('--pctexplpeak_thrshld', default=0) | 47 parser.add_argument('--pctexplpeak_thrshld', default=0) |
| 48 parser.add_argument('--schema') | 48 parser.add_argument('--schema') |
| 49 parser.add_argument('--cores_top_level', default=1) | 49 parser.add_argument('--cores_top_level', default=1) |
| 50 parser.add_argument('--chunks', default=1) | 50 parser.add_argument('--chunks', default=1) |
| 51 parser.add_argument('--meta_select_col', default='name') | 51 parser.add_argument('--meta_select_col', default='name') |
| 52 parser.add_argument('--skip_invalid_adducts', action='store_true') | 52 parser.add_argument('--skip_invalid_adducts', action='store_true') |
| 53 | 53 |
| 54 parser.add_argument('--ScoreSuspectLists', default='') | 54 parser.add_argument('--ScoreSuspectLists', default='') |
| 55 parser.add_argument('--MetFragScoreTypes', default="FragmenterScore,OfflineMetFusionScore") | 55 parser.add_argument('--MetFragScoreTypes', |
| 56 default="FragmenterScore,OfflineMetFusionScore") | |
| 56 parser.add_argument('--MetFragScoreWeights', default="1.0,1.0") | 57 parser.add_argument('--MetFragScoreWeights', default="1.0,1.0") |
| 57 | |
| 58 | 58 |
| 59 args = parser.parse_args() | 59 args = parser.parse_args() |
| 60 print(args) | 60 print(args) |
| 61 | 61 |
| 62 | |
| 63 config = ConfigParser.ConfigParser() | 62 config = ConfigParser.ConfigParser() |
| 64 config.read(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.ini')) | 63 config.read( |
| 65 | 64 os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.ini')) |
| 66 | 65 |
| 67 if os.stat(args.input_pth).st_size == 0: | 66 if os.stat(args.input_pth).st_size == 0: |
| 68 print('Input file empty') | 67 print('Input file empty') |
| 69 exit() | 68 exit() |
| 70 | |
| 71 | 69 |
| 72 # Create temporary working directory | 70 # Create temporary working directory |
| 73 if args.temp_dir: | 71 if args.temp_dir: |
| 74 wd = args.temp_dir | 72 wd = args.temp_dir |
| 75 else: | 73 else: |
| 83 | 81 |
| 84 ###################################################################### | 82 ###################################################################### |
| 85 # Setup regular expressions for MSP parsing dictionary | 83 # Setup regular expressions for MSP parsing dictionary |
| 86 ###################################################################### | 84 ###################################################################### |
| 87 regex_msp = {} | 85 regex_msp = {} |
| 88 regex_msp['name'] = ['^Name(?:=|:)(.*)$'] | 86 regex_msp['name'] = [r'^Name(?:=|:)(.*)$'] |
| 89 regex_msp['polarity'] = ['^ion.*mode(?:=|:)(.*)$', '^ionization.*mode(?:=|:)(.*)$', '^polarity(?:=|:)(.*)$'] | 87 regex_msp['polarity'] = [r'^ion.*mode(?:=|:)(.*)$', |
| 90 regex_msp['precursor_mz'] = ['^precursor.*m/z(?:=|:)\s*(\d*[.,]?\d*)$', '^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$'] | 88 r'^ionization.*mode(?:=|:)(.*)$', |
| 91 regex_msp['precursor_type'] = ['^precursor.*type(?:=|:)(.*)$', '^adduct(?:=|:)(.*)$', '^ADDUCTIONNAME(?:=|:)(.*)$'] | 89 r'^polarity(?:=|:)(.*)$'] |
| 92 regex_msp['num_peaks'] = ['^Num.*Peaks(?:=|:)\s*(\d*)$'] | 90 regex_msp['precursor_mz'] = [r'^precursor.*m/z(?:=|:)\s*(\d*[.,]?\d*)$', |
| 93 regex_msp['msp'] = ['^Name(?:=|:)(.*)$'] # Flag for standard MSP format | 91 r'^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$'] |
| 92 regex_msp['precursor_type'] = [r'^precursor.*type(?:=|:)(.*)$', | |
| 93 r'^adduct(?:=|:)(.*)$', | |
| 94 r'^ADDUCTIONNAME(?:=|:)(.*)$'] | |
| 95 regex_msp['num_peaks'] = [r'^Num.*Peaks(?:=|:)\s*(\d*)$'] | |
| 96 regex_msp['msp'] = [r'^Name(?:=|:)(.*)$'] # Flag for standard MSP format | |
| 94 | 97 |
| 95 regex_massbank = {} | 98 regex_massbank = {} |
| 96 regex_massbank['name'] = ['^RECORD_TITLE:(.*)$'] | 99 regex_massbank['name'] = [r'^RECORD_TITLE:(.*)$'] |
| 97 regex_massbank['polarity'] = ['^AC\$MASS_SPECTROMETRY:\s+ION_MODE\s+(.*)$'] | 100 regex_massbank['polarity'] = [r'^AC\$MASS_SPECTROMETRY:\s+ION_MODE\s+(.*)$'] |
| 98 regex_massbank['precursor_mz'] = ['^MS\$FOCUSED_ION:\s+PRECURSOR_M/Z\s+(\d*[.,]?\d*)$'] | 101 regex_massbank['precursor_mz'] = [ |
| 99 regex_massbank['precursor_type'] = ['^MS\$FOCUSED_ION:\s+PRECURSOR_TYPE\s+(.*)$'] | 102 r'^MS\$FOCUSED_ION:\s+PRECURSOR_M/Z\s+(\d*[.,]?\d*)$'] |
| 100 regex_massbank['num_peaks'] = ['^PK\$NUM_PEAK:\s+(\d*)'] | 103 regex_massbank['precursor_type'] = [ |
| 101 regex_massbank['cols'] = ['^PK\$PEAK:\s+(.*)'] | 104 r'^MS\$FOCUSED_ION:\s+PRECURSOR_TYPE\s+(.*)$'] |
| 102 regex_massbank['massbank'] = ['^RECORD_TITLE:(.*)$'] # Flag for massbank format | 105 regex_massbank['num_peaks'] = [r'^PK\$NUM_PEAK:\s+(\d*)'] |
| 106 regex_massbank['cols'] = [r'^PK\$PEAK:\s+(.*)'] | |
| 107 regex_massbank['massbank'] = [ | |
| 108 r'^RECORD_TITLE:(.*)$'] # Flag for massbank format | |
| 103 | 109 |
| 104 if args.schema == 'msp': | 110 if args.schema == 'msp': |
| 105 meta_regex = regex_msp | 111 meta_regex = regex_msp |
| 106 elif args.schema == 'massbank': | 112 elif args.schema == 'massbank': |
| 107 meta_regex = regex_massbank | 113 meta_regex = regex_massbank |
| 108 elif args.schema == 'auto': | 114 elif args.schema == 'auto': |
| 109 # If auto we just check for all the available parameter names and then determine if Massbank or MSP based on | 115 # If auto we just check for all the available parameter names and then |
| 110 # the name parameter | 116 # determine if Massbank or MSP based on the name parameter |
| 111 meta_regex = {} | 117 meta_regex = {} |
| 112 meta_regex.update(regex_massbank) | 118 meta_regex.update(regex_massbank) |
| 113 meta_regex['name'].extend(regex_msp['name']) | 119 meta_regex['name'].extend(regex_msp['name']) |
| 114 meta_regex['polarity'].extend(regex_msp['polarity']) | 120 meta_regex['polarity'].extend(regex_msp['polarity']) |
| 115 meta_regex['precursor_mz'].extend(regex_msp['precursor_mz']) | 121 meta_regex['precursor_mz'].extend(regex_msp['precursor_mz']) |
| 116 meta_regex['precursor_type'].extend(regex_msp['precursor_type']) | 122 meta_regex['precursor_type'].extend(regex_msp['precursor_type']) |
| 117 meta_regex['num_peaks'].extend(regex_msp['num_peaks']) | 123 meta_regex['num_peaks'].extend(regex_msp['num_peaks']) |
| 118 meta_regex['msp'] = regex_msp['msp'] | 124 meta_regex['msp'] = regex_msp['msp'] |
| 119 | 125 else: |
| 120 | 126 sys.exit("No schema selected") |
| 121 | 127 |
| 122 adduct_types = { | 128 adduct_types = { |
| 123 '[M+H]+': 1.007276, | 129 '[M+H]+': 1.007276, |
| 124 '[M+NH4]+': 18.034374, | 130 '[M+NH4]+': 18.034374, |
| 125 '[M+Na]+': 22.989218, | 131 '[M+Na]+': 22.989218, |
| 129 '[M+ACN+Na]+': 64.015765, | 135 '[M+ACN+Na]+': 64.015765, |
| 130 '[M+2ACN+H]+': 83.06037, | 136 '[M+2ACN+H]+': 83.06037, |
| 131 '[M-H]-': -1.007276, | 137 '[M-H]-': -1.007276, |
| 132 '[M+Cl]-': 34.969402, | 138 '[M+Cl]-': 34.969402, |
| 133 '[M+HCOO]-': 44.99819, | 139 '[M+HCOO]-': 44.99819, |
| 134 '[M-H+HCOOH]-': 44.99819, # same as above but different style of writing adduct | 140 '[M-H+HCOOH]-': 44.99819, |
| 141 # same as above but different style of writing adduct | |
| 135 '[M+CH3COO]-': 59.01385, | 142 '[M+CH3COO]-': 59.01385, |
| 136 '[M-H+CH3COOH]-': 59.01385 # same as above but different style of writing adduct | 143 '[M-H+CH3COOH]-': 59.01385 |
| 144 # same as above but different style of writing adduct | |
| 137 } | 145 } |
| 138 inv_adduct_types = {int(round(v, 0)): k for k, v in adduct_types.iteritems()} | 146 inv_adduct_types = {int(round(v, 0)): k for k, v in adduct_types.iteritems()} |
| 139 | 147 |
| 148 | |
| 140 # function to extract the meta data using the regular expressions | 149 # function to extract the meta data using the regular expressions |
| 141 def parse_meta(meta_regex, meta_info={}): | 150 def parse_meta(meta_regex, meta_info=None): |
| 151 if meta_info is None: | |
| 152 meta_info = {} | |
| 142 for k, regexes in six.iteritems(meta_regex): | 153 for k, regexes in six.iteritems(meta_regex): |
| 143 for reg in regexes: | 154 for reg in regexes: |
| 144 m = re.search(reg, line, re.IGNORECASE) | 155 m = re.search(reg, line, re.IGNORECASE) |
| 145 if m: | 156 if m: |
| 146 meta_info[k] = '-'.join(m.groups()).strip() | 157 meta_info[k] = '-'.join(m.groups()).strip() |
| 147 return meta_info | 158 return meta_info |
| 148 | 159 |
| 149 | 160 |
| 150 | |
| 151 ###################################################################### | 161 ###################################################################### |
| 152 # Setup parameter dictionary | 162 # Setup parameter dictionary |
| 153 ###################################################################### | 163 ###################################################################### |
| 154 def init_paramd(args): | 164 def init_paramd(args): |
| 155 paramd = defaultdict() | 165 paramd = defaultdict() |
| 157 paramd["MetFragDatabaseType"] = args.MetFragDatabaseType | 167 paramd["MetFragDatabaseType"] = args.MetFragDatabaseType |
| 158 | 168 |
| 159 if args.MetFragDatabaseType == "LocalCSV": | 169 if args.MetFragDatabaseType == "LocalCSV": |
| 160 paramd["LocalDatabasePath"] = args.LocalDatabasePath | 170 paramd["LocalDatabasePath"] = args.LocalDatabasePath |
| 161 elif args.MetFragDatabaseType == "MetChem": | 171 elif args.MetFragDatabaseType == "MetChem": |
| 162 paramd["LocalMetChemDatabase"] = config.get('MetChem', 'LocalMetChemDatabase') | 172 paramd["LocalMetChemDatabase"] = \ |
| 163 paramd["LocalMetChemDatabasePortNumber"] = config.get('MetChem', 'LocalMetChemDatabasePortNumber') | 173 config.get('MetChem', 'LocalMetChemDatabase') |
| 164 paramd["LocalMetChemDatabaseServerIp"] = args.LocalMetChemDatabaseServerIp | 174 paramd["LocalMetChemDatabasePortNumber"] = \ |
| 165 paramd["LocalMetChemDatabaseUser"] = config.get('MetChem', 'LocalMetChemDatabaseUser') | 175 config.get('MetChem', 'LocalMetChemDatabasePortNumber') |
| 166 paramd["LocalMetChemDatabasePassword"] = config.get('MetChem', 'LocalMetChemDatabasePassword') | 176 paramd["LocalMetChemDatabaseServerIp"] = \ |
| 167 | 177 args.LocalMetChemDatabaseServerIp |
| 168 paramd["FragmentPeakMatchAbsoluteMassDeviation"] = args.FragmentPeakMatchAbsoluteMassDeviation | 178 paramd["LocalMetChemDatabaseUser"] = \ |
| 169 paramd["FragmentPeakMatchRelativeMassDeviation"] = args.FragmentPeakMatchRelativeMassDeviation | 179 config.get('MetChem', 'LocalMetChemDatabaseUser') |
| 170 paramd["DatabaseSearchRelativeMassDeviation"] = args.DatabaseSearchRelativeMassDeviation | 180 paramd["LocalMetChemDatabasePassword"] = \ |
| 181 config.get('MetChem', 'LocalMetChemDatabasePassword') | |
| 182 | |
| 183 paramd["FragmentPeakMatchAbsoluteMassDeviation"] = \ | |
| 184 args.FragmentPeakMatchAbsoluteMassDeviation | |
| 185 paramd["FragmentPeakMatchRelativeMassDeviation"] = \ | |
| 186 args.FragmentPeakMatchRelativeMassDeviation | |
| 187 paramd["DatabaseSearchRelativeMassDeviation"] = \ | |
| 188 args.DatabaseSearchRelativeMassDeviation | |
| 171 paramd["SampleName"] = '' | 189 paramd["SampleName"] = '' |
| 172 paramd["ResultsPath"] = os.path.join(wd) | 190 paramd["ResultsPath"] = os.path.join(wd) |
| 173 | 191 |
| 174 if args.polarity == "pos": | 192 if args.polarity == "pos": |
| 175 paramd["IsPositiveIonMode"] = True | 193 paramd["IsPositiveIonMode"] = True |
| 208 filterh.append('MaximumElementsFilter') | 226 filterh.append('MaximumElementsFilter') |
| 209 dct_filter['FilterMaximumElements'] = args.FilterMaximumElements | 227 dct_filter['FilterMaximumElements'] = args.FilterMaximumElements |
| 210 | 228 |
| 211 if args.FilterSmartsInclusionList: | 229 if args.FilterSmartsInclusionList: |
| 212 filterh.append('SmartsSubstructureInclusionFilter') | 230 filterh.append('SmartsSubstructureInclusionFilter') |
| 213 dct_filter['FilterSmartsInclusionList'] = args.FilterSmartsInclusionList | 231 dct_filter[ |
| 232 'FilterSmartsInclusionList'] = args.FilterSmartsInclusionList | |
| 214 | 233 |
| 215 if args.FilterSmartsExclusionList: | 234 if args.FilterSmartsExclusionList: |
| 216 filterh.append('SmartsSubstructureExclusionFilter') | 235 filterh.append('SmartsSubstructureExclusionFilter') |
| 217 dct_filter['FilterSmartsExclusionList'] = args.FilterSmartsExclusionList | 236 dct_filter[ |
| 218 | 237 'FilterSmartsExclusionList'] = args.FilterSmartsExclusionList |
| 219 # My understanding is that both 'ElementInclusionExclusiveFilter' and 'ElementExclusionFilter' use | 238 |
| 220 # 'FilterIncludedElements' | 239 # My understanding is that both 'ElementInclusionExclusiveFilter' |
| 240 # and 'ElementExclusionFilter' use 'FilterIncludedElements' | |
| 221 if args.FilterIncludedExclusiveElements: | 241 if args.FilterIncludedExclusiveElements: |
| 222 filterh.append('ElementInclusionExclusiveFilter') | 242 filterh.append('ElementInclusionExclusiveFilter') |
| 223 dct_filter['FilterIncludedElements'] = args.FilterIncludedExclusiveElements | 243 dct_filter[ |
| 244 'FilterIncludedElements'] = args.FilterIncludedExclusiveElements | |
| 224 | 245 |
| 225 if args.FilterIncludedElements: | 246 if args.FilterIncludedElements: |
| 226 filterh.append('ElementInclusionFilter') | 247 filterh.append('ElementInclusionFilter') |
| 227 dct_filter['FilterIncludedElements'] = args.FilterIncludedElements | 248 dct_filter['FilterIncludedElements'] = args.FilterIncludedElements |
| 228 | 249 |
| 242 | 263 |
| 243 ###################################################################### | 264 ###################################################################### |
| 244 # Function to run metfrag when all metainfo and peaks have been parsed | 265 # Function to run metfrag when all metainfo and peaks have been parsed |
| 245 ###################################################################### | 266 ###################################################################### |
| 246 def run_metfrag(meta_info, peaklist, args, wd, spectrac, adduct_types): | 267 def run_metfrag(meta_info, peaklist, args, wd, spectrac, adduct_types): |
| 247 # Get sample details (if possible to extract, e.g. if created as part of the msPurity pipeline). | 268 # Get sample details (if possible to extract, e.g. if created as part of |
| 248 # Choose between getting additional details to add as columns as either all meta data from msp, just | 269 # the msPurity pipeline). Choose between getting additional details to add |
| 249 # details from the record name (i.e. when using msPurity and we have the columns coded into the name) or | 270 # as columns as either all meta data from msp, just details from the |
| 250 # just the spectra index (spectrac) | 271 # record name (i.e. when using msPurity and we have the columns coded into |
| 272 # the name) or just the spectra index (spectrac). |
| 251 # Returns the parameters used and the command line call | 273 # Returns the parameters used and the command line call |
| 252 | 274 |
| 253 paramd = init_paramd(args) | 275 paramd = init_paramd(args) |
| 254 if args.meta_select_col == 'name': | 276 if args.meta_select_col == 'name': |
| 255 # have additional column of just the name | 277 # have additional column of just the name |
| 256 paramd['additional_details'] = {'name': meta_info['name']} | 278 paramd['additional_details'] = {'name': meta_info['name']} |
| 257 elif args.meta_select_col == 'name_split': | 279 elif args.meta_select_col == 'name_split': |
| 258 # have additional columns split by "|" and then on ":" e.g. MZ:100.2 | RT:20 | xcms_grp_id:1 | 280 # have additional columns split by "|" and |
| 259 paramd['additional_details'] = {sm.split(":")[0].strip(): sm.split(":")[1].strip() for sm in | 281 # then on ":" e.g. MZ:100.2 | RT:20 | xcms_grp_id:1 |
| 260 meta_info['name'].split("|")} | 282 paramd['additional_details'] = { |
| 283 sm.split(":")[0].strip(): sm.split(":")[1].strip() for sm in | |
| 284 meta_info['name'].split("|")} | |
| 261 elif args.meta_select_col == 'all': | 285 elif args.meta_select_col == 'all': |
| 262 # have additional columns based on all the meta information extracted from the MSP | 286 # have additional columns based on all the meta information |
| 287 # extracted from the MSP | |
| 263 paramd['additional_details'] = meta_info | 288 paramd['additional_details'] = meta_info |
| 264 else: | 289 else: |
| 265 # Just have an index of the spectra in the MSP file | 290 # Just have an index of the spectra in the MSP file |
| 266 paramd['additional_details'] = {'spectra_idx': spectrac} | 291 paramd['additional_details'] = {'spectra_idx': spectrac} |
| 267 | 292 |
| 268 | |
| 269 paramd["SampleName"] = "{}_metfrag_result".format(spectrac) | 293 paramd["SampleName"] = "{}_metfrag_result".format(spectrac) |
| 270 | 294 |
| 271 # =============== Output peaks to txt file ============================== | 295 # =============== Output peaks to txt file ============================== |
| 272 paramd["PeakListPath"] = os.path.join(wd, "{}_tmpspec.txt".format(spectrac)) | 296 paramd["PeakListPath"] = os.path.join(wd, |
| 297 "{}_tmpspec.txt".format(spectrac)) | |
| 273 | 298 |
| 274 # write spec file | 299 # write spec file |
| 275 with open(paramd["PeakListPath"], 'w') as outfile: | 300 with open(paramd["PeakListPath"], 'w') as outfile: |
| 276 for p in peaklist: | 301 for p in peaklist: |
| 277 outfile.write(p[0] + "\t" + p[1] + "\n") | 302 outfile.write(p[0] + "\t" + p[1] + "\n") |
| 278 | 303 |
| 279 # =============== Update param based on MSP metadata ====================== | 304 # =============== Update param based on MSP metadata ====================== |
| 280 # Replace param details with details from MSP if required | 305 # Replace param details with details from MSP if required |
| 281 if 'precursor_type' in meta_info and meta_info['precursor_type'] in adduct_types: | 306 if 'precursor_type' in meta_info and \ |
| 307 meta_info['precursor_type'] in adduct_types: | |
| 282 adduct = meta_info['precursor_type'] | 308 adduct = meta_info['precursor_type'] |
| 283 nm = float(meta_info['precursor_mz']) - adduct_types[meta_info['precursor_type']] | 309 nm = float(meta_info['precursor_mz']) - adduct_types[ |
| 284 paramd["PrecursorIonMode"] = int(round(adduct_types[meta_info['precursor_type']], 0)) | 310 meta_info['precursor_type']] |
| 311 paramd["PrecursorIonMode"] = \ | |
| 312 int(round(adduct_types[meta_info['precursor_type']], 0)) | |
| 285 elif not args.skip_invalid_adducts: | 313 elif not args.skip_invalid_adducts: |
| 286 adduct = inv_adduct_types[int(paramd['PrecursorIonModeDefault'])] | 314 adduct = inv_adduct_types[int(paramd['PrecursorIonModeDefault'])] |
| 287 paramd["PrecursorIonMode"] = paramd['PrecursorIonModeDefault'] | 315 paramd["PrecursorIonMode"] = paramd['PrecursorIonModeDefault'] |
| 288 nm = float(meta_info['precursor_mz']) - paramd['nm_mass_diff_default'] | 316 nm = float(meta_info['precursor_mz']) - paramd['nm_mass_diff_default'] |
| 289 else: | 317 else: |
| 291 return '', '' | 319 return '', '' |
| 292 | 320 |
| 293 paramd['additional_details']['adduct'] = adduct | 321 paramd['additional_details']['adduct'] = adduct |
| 294 paramd["NeutralPrecursorMass"] = nm | 322 paramd["NeutralPrecursorMass"] = nm |
| 295 | 323 |
| 296 # =============== Create CLI cmd for metfrag =============================== | 324 # ============== Create CLI cmd for metfrag =============================== |
| 297 cmd = "metfrag" | 325 cmd = "metfrag" |
| 298 for k, v in six.iteritems(paramd): | 326 for k, v in six.iteritems(paramd): |
| 299 if not k in ['PrecursorIonModeDefault', 'nm_mass_diff_default', 'additional_details']: | 327 if k not in ['PrecursorIonModeDefault', 'nm_mass_diff_default', |
| 328 'additional_details']: | |
| 300 cmd += " {}={}".format(str(k), str(v)) | 329 cmd += " {}={}".format(str(k), str(v)) |
| 301 | 330 |
| 302 # =============== Run metfrag ============================================== | 331 # ============== Run metfrag ============================================== |
| 303 #print(cmd) | 332 # print(cmd) |
| 304 # Filter before process with a minimum number of MS/MS peaks | 333 # Filter before process with a minimum number of MS/MS peaks |
| 305 if plinesread >= float(args.minMSMSpeaks): | 334 if plinesread >= float(args.minMSMSpeaks): |
| 306 | 335 |
| 307 if int(args.cores_top_level) == 1: | 336 if int(args.cores_top_level) == 1: |
| 308 os.system(cmd) | 337 os.system(cmd) |
| 309 | 338 |
| 310 | |
| 311 | |
| 312 return paramd, cmd | 339 return paramd, cmd |
| 313 | 340 |
| 314 | 341 |
| 315 def work(cmds): | 342 def work(cmds): |
| 316 return [os.system(cmd) for cmd in cmds] | 343 return [os.system(cmd) for cmd in cmds] |
| 317 | |
| 318 | 344 |
| 319 | 345 |
| 320 ###################################################################### | 346 ###################################################################### |
| 321 # Parse MSP file and run metfrag CLI | 347 # Parse MSP file and run metfrag CLI |
| 322 ###################################################################### | 348 ###################################################################### |
| 336 plinesread = 0 | 362 plinesread = 0 |
| 337 for line in infile: | 363 for line in infile: |
| 338 line = line.strip() | 364 line = line.strip() |
| 339 | 365 |
| 340 if pnumlines == 0: | 366 if pnumlines == 0: |
| 341 # =============== Extract metadata from MSP ======================== | 367 # ============== Extract metadata from MSP ======================== |
| 342 meta_info = parse_meta(meta_regex, meta_info) | 368 meta_info = parse_meta(meta_regex, meta_info) |
| 343 | 369 |
| 344 if ('massbank' in meta_info and 'cols' in meta_info) or ('msp' in meta_info and 'num_peaks' in meta_info): | 370 if ('massbank' in meta_info and 'cols' in meta_info) or ( |
| 345 | 371 'msp' in meta_info and 'num_peaks' in meta_info): |
| 346 pnumlines = int(meta_info['num_peaks']) | 372 pnumlines = int(meta_info['num_peaks']) |
| 347 plinesread = 0 | 373 plinesread = 0 |
| 348 peaklist = [] | 374 peaklist = [] |
| 349 | 375 |
| 350 elif plinesread < pnumlines: | 376 elif plinesread < pnumlines: |
| 351 # =============== Extract peaks from MSP ========================== | 377 # ============== Extract peaks from MSP ========================== |
| 352 line = tuple(line.split()) # .split() will split on any empty space (i.e. tab and space) | 378 # .split() will split on any empty space (i.e. tab and space) |
| 379 line = tuple(line.split()) | |
| 353 # Keep only m/z and intensity, not relative intensity | 380 # Keep only m/z and intensity, not relative intensity |
| 354 save_line = tuple(line[0].split() + line[1].split()) | 381 save_line = tuple(line[0].split() + line[1].split()) |
| 355 plinesread += 1 | 382 plinesread += 1 |
| 356 peaklist.append(save_line) | 383 peaklist.append(save_line) |
| 357 | 384 |
| 358 elif plinesread and plinesread == pnumlines: | 385 elif plinesread and plinesread == pnumlines: |
| 359 # =============== Get sample name and additional details for output ======= | 386 # ======= Get sample name and additional details for output ======= |
| 360 spectrac += 1 | 387 spectrac += 1 |
| 361 paramd, cmd = run_metfrag(meta_info, peaklist, args, wd, spectrac, adduct_types) | 388 paramd, cmd = run_metfrag(meta_info, peaklist, args, wd, spectrac, |
| 389 adduct_types) | |
| 362 | 390 |
| 363 if paramd: | 391 if paramd: |
| 364 paramds[paramd["SampleName"]] = paramd | 392 paramds[paramd["SampleName"]] = paramd |
| 365 cmds.append(cmd) | 393 cmds.append(cmd) |
| 366 | 394 |
| 369 plinesread = 0 | 397 plinesread = 0 |
| 370 | 398 |
| 371 # end of file. Check if there is a MSP spectra to run metfrag on still | 399 # end of file. Check if there is a MSP spectra to run metfrag on still |
| 372 if plinesread and plinesread == pnumlines: | 400 if plinesread and plinesread == pnumlines: |
| 373 | 401 |
| 374 paramd, cmd = run_metfrag(meta_info, peaklist, args, wd, spectrac+1, adduct_types) | 402 paramd, cmd = run_metfrag(meta_info, peaklist, args, wd, spectrac + 1, |
| 403 adduct_types) | |
| 375 | 404 |
| 376 if paramd: | 405 if paramd: |
| 377 paramds[paramd["SampleName"]] = paramd | 406 paramds[paramd["SampleName"]] = paramd |
| 378 cmds.append(cmd) | 407 cmds.append(cmd) |
| 379 | 408 |
| 380 | |
| 381 | |
| 382 | |
| 383 # Perform multiprocessing on command line call level | 409 # Perform multiprocessing on command line call level |
| 384 if int(args.cores_top_level) > 1: | 410 if int(args.cores_top_level) > 1: |
| 385 cmds_chunks = [cmds[x:x + int(args.chunks)] for x in list(range(0, len(cmds), int(args.chunks)))] | 411 cmds_chunks = [cmds[x:x + int(args.chunks)] for x in |
| 412 list(range(0, len(cmds), int(args.chunks)))] | |
| 386 pool = multiprocessing.Pool(processes=int(args.cores_top_level)) | 413 pool = multiprocessing.Pool(processes=int(args.cores_top_level)) |
| 387 pool.map(work, cmds_chunks) | 414 pool.map(work, cmds_chunks) |
| 388 pool.close() | 415 pool.close() |
| 389 pool.join() | 416 pool.join() |
| 390 | 417 |
| 391 ###################################################################### | 418 ###################################################################### |
| 392 # Concatenate and filter the output | 419 # Concatenate and filter the output |
| 393 ###################################################################### | 420 ###################################################################### |
| 394 # outputs might have different headers. Need to get a list of all the headers before we start merging the files | 421 # outputs might have different headers. Need to get a list of all the |
| 395 # outfiles = [os.path.join(wd, f) for f in glob.glob(os.path.join(wd, "*_metfrag_result.csv"))] | 422 # headers before we start merging the files |
| 423 # outfiles = [os.path.join(wd, f) for f in glob.glob(os.path.join(wd, | |
| 424 # "*_metfrag_result.csv"))] | |
| 396 outfiles = glob.glob(os.path.join(wd, "*_metfrag_result.csv")) | 425 outfiles = glob.glob(os.path.join(wd, "*_metfrag_result.csv")) |
| 397 | 426 |
| 398 if len(outfiles) == 0: | 427 if len(outfiles) == 0: |
| 399 print('No results') | 428 print('No results') |
| 400 sys.exit() | 429 sys.exit() |
| 406 reader = csv.reader(infile) | 435 reader = csv.reader(infile) |
| 407 if sys.version_info >= (3, 0): | 436 if sys.version_info >= (3, 0): |
| 408 headers.extend(next(reader)) | 437 headers.extend(next(reader)) |
| 409 else: | 438 else: |
| 410 headers.extend(reader.next()) | 439 headers.extend(reader.next()) |
| 411 # check if file has any data rows | 440 # check if file has any data rows |
| 412 for i, row in enumerate(reader): | 441 for i, row in enumerate(reader): |
| 413 c += 1 | 442 c += 1 |
| 414 if i == 1: | 443 if i == 1: |
| 415 break | 444 break |
| 416 | 445 |
| 417 # if no data rows (e.g. matches) then do not save an output and leave the program | 446 # if no data rows (e.g. matches) then do not save an |
| 447 # output and leave the program | |
| 418 if c == 0: | 448 if c == 0: |
| 419 print('No results') | 449 print('No results') |
| 420 sys.exit() | 450 sys.exit() |
| 421 | 451 |
| 422 additional_detail_headers = ['sample_name'] | 452 additional_detail_headers = ['sample_name'] |
| 423 for k, paramd in six.iteritems(paramds): | 453 for k, paramd in six.iteritems(paramds): |
| 424 additional_detail_headers = list(set(additional_detail_headers + list(paramd['additional_details'].keys()))) | 454 additional_detail_headers = list(set( |
| 455 additional_detail_headers + list(paramd['additional_details'].keys()))) | |
| 425 | 456 |
| 426 # add inchikey if not already present (missing in metchem output) | 457 # add inchikey if not already present (missing in metchem output) |
| 427 if 'InChIKey' not in headers: | 458 if 'InChIKey' not in headers: |
| 428 headers.append('InChIKey') | 459 headers.append('InChIKey') |
| 429 | 460 |
| 430 headers = additional_detail_headers + sorted(list(set(headers))) | 461 headers = additional_detail_headers + sorted(list(set(headers))) |
| 431 | 462 |
| 432 | |
| 433 | |
| 434 # Sort files nicely | 463 # Sort files nicely |
| 435 outfiles.sort(key = lambda s: int(re.match('^.*/(\d+)_metfrag_result.csv', s).group(1))) | 464 outfiles.sort( |
| 465 key=lambda s: int(re.match(r'^.*/(\d+)_metfrag_result.csv', s).group(1))) | |
| 436 | 466 |
| 437 print(outfiles) | 467 print(outfiles) |
| 438 | |
| 439 | 468 |
| 440 # merge outputs | 469 # merge outputs |
| 441 with open(args.result_pth, 'a') as merged_outfile: | 470 with open(args.result_pth, 'a') as merged_outfile: |
| 442 dwriter = csv.DictWriter(merged_outfile, fieldnames=headers, delimiter='\t', quotechar='"') | 471 dwriter = csv.DictWriter(merged_outfile, fieldnames=headers, |
| 472 delimiter='\t', quotechar='"') | |
| 443 dwriter.writeheader() | 473 dwriter.writeheader() |
| 444 | 474 |
| 445 for fn in outfiles: | 475 for fn in outfiles: |
| 446 | 476 |
| 447 with open(fn) as infile: | 477 with open(fn) as infile: |
| 449 for line in reader: | 479 for line in reader: |
| 450 bewrite = True | 480 bewrite = True |
| 451 for key, value in line.items(): | 481 for key, value in line.items(): |
| 452 # Filter when no MS/MS peak matched | 482 # Filter when no MS/MS peak matched |
| 453 if key == "ExplPeaks": | 483 if key == "ExplPeaks": |
| 454 if float(args.pctexplpeak_thrshld) > 0 and "NA" in value: | 484 if float(args.pctexplpeak_thrshld) > 0 and \ |
| 485 "NA" in value: | |
| 455 bewrite = False | 486 bewrite = False |
| 456 # Filter with a score threshold | 487 # Filter with a score threshold |
| 457 elif key == "Score": | 488 elif key == "Score": |
| 458 if float(value) <= float(args.score_thrshld): | 489 if float(value) <= float(args.score_thrshld): |
| 459 bewrite = False | 490 bewrite = False |
| 475 bfn = os.path.basename(fn) | 506 bfn = os.path.basename(fn) |
| 476 bfn = bfn.replace(".csv", "") | 507 bfn = bfn.replace(".csv", "") |
| 477 line['sample_name'] = paramds[bfn]['SampleName'] | 508 line['sample_name'] = paramds[bfn]['SampleName'] |
| 478 ad = paramds[bfn]['additional_details'] | 509 ad = paramds[bfn]['additional_details'] |
| 479 | 510 |
| 480 if args.MetFragDatabaseType == "MetChem": | 511 if args.MetFragDatabaseType == "MetChem": |
| 481 # for some reason the metchem database option does not report the full InChIKey (at least | 512 # for some reason the metchem database option does |
| 482 # in the Bham setup). This ensures we always get the full InChIKey | 513 # not report the full InChIKey (at least in the Bham |
| 483 line['InChIKey'] = '{}-{}-{}'.format(line['InChIKey1'], line['InChIKey2'], line['InChIKey3']) | 514 # setup). This ensures we always get the full InChIKey |
| 515 line['InChIKey'] = '{}-{}-{}'.format(line['InChIKey1'], | |
| 516 line['InChIKey2'], | |
| 517 line['InChIKey3']) | |
| 484 | 518 |
| 485 line.update(ad) | 519 line.update(ad) |
| 486 dwriter.writerow(line) | 520 dwriter.writerow(line) |
