diff of metfrag.py @ changeset 9:5763234618d4 (draft, default, tip)
"planemo upload for repository https://github.com/computational-metabolomics/metfrag-galaxy commit 6bc059c100f1f968d99c1d5ad5e7259e83d386b6"
| field | value |
|---|---|
| author | tomnl |
| date | Fri, 04 Oct 2019 07:15:52 -0400 |
| parents | 9a3019c609d9 |
| children | |
```diff
--- a/metfrag.py	Fri Sep 13 08:27:59 2019 -0400
+++ b/metfrag.py	Fri Oct 04 07:15:52 2019 -0400
@@ -1,18 +1,18 @@
 from __future__ import absolute_import, print_function
 
+
+import ConfigParser
 import argparse
 import csv
+import glob
+import multiprocessing
 import os
-import sys
-import six
 import re
-import ConfigParser
 import shutil
-import glob
+import sys
 import tempfile
-import multiprocessing
+from collections import defaultdict
-from subprocess import call
-from collections import defaultdict
+import six
 
 
 print(sys.version)
@@ -49,26 +49,24 @@
 parser.add_argument('--cores_top_level', default=1)
 parser.add_argument('--chunks', default=1)
 parser.add_argument('--meta_select_col', default='name')
-parser.add_argument('--skip_invalid_adducts', action='store_true')
+parser.add_argument('--skip_invalid_adducts', action='store_true')
 parser.add_argument('--ScoreSuspectLists', default='')
-parser.add_argument('--MetFragScoreTypes', default="FragmenterScore,OfflineMetFusionScore")
+parser.add_argument('--MetFragScoreTypes',
+                    default="FragmenterScore,OfflineMetFusionScore")
 parser.add_argument('--MetFragScoreWeights', default="1.0,1.0")
 
-
 args = parser.parse_args()
 print(args)
 
-
 config = ConfigParser.ConfigParser()
-config.read(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.ini'))
-
+config.read(
+    os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.ini'))
 
 
 if os.stat(args.input_pth).st_size == 0:
     print('Input file empty')
     exit()
 
-
 # Create temporary working directory
 if args.temp_dir:
     wd = args.temp_dir
@@ -85,29 +83,37 @@
 # Setup regular expressions for MSP parsing dictionary
 ######################################################################
 regex_msp = {}
-regex_msp['name'] = ['^Name(?:=|:)(.*)$']
-regex_msp['polarity'] = ['^ion.*mode(?:=|:)(.*)$', '^ionization.*mode(?:=|:)(.*)$', '^polarity(?:=|:)(.*)$']
-regex_msp['precursor_mz'] = ['^precursor.*m/z(?:=|:)\s*(\d*[.,]?\d*)$', '^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$']
-regex_msp['precursor_type'] = ['^precursor.*type(?:=|:)(.*)$', '^adduct(?:=|:)(.*)$', '^ADDUCTIONNAME(?:=|:)(.*)$']
-regex_msp['num_peaks'] = ['^Num.*Peaks(?:=|:)\s*(\d*)$']
-regex_msp['msp'] = ['^Name(?:=|:)(.*)$']  # Flag for standard MSP format
+regex_msp['name'] = [r'^Name(?:=|:)(.*)$']
+regex_msp['polarity'] = [r'^ion.*mode(?:=|:)(.*)$',
+                         r'^ionization.*mode(?:=|:)(.*)$',
+                         r'^polarity(?:=|:)(.*)$']
+regex_msp['precursor_mz'] = [r'^precursor.*m/z(?:=|:)\s*(\d*[.,]?\d*)$',
+                             r'^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$']
+regex_msp['precursor_type'] = [r'^precursor.*type(?:=|:)(.*)$',
+                               r'^adduct(?:=|:)(.*)$',
+                               r'^ADDUCTIONNAME(?:=|:)(.*)$']
+regex_msp['num_peaks'] = [r'^Num.*Peaks(?:=|:)\s*(\d*)$']
+regex_msp['msp'] = [r'^Name(?:=|:)(.*)$']  # Flag for standard MSP format
 
 regex_massbank = {}
-regex_massbank['name'] = ['^RECORD_TITLE:(.*)$']
-regex_massbank['polarity'] = ['^AC\$MASS_SPECTROMETRY:\s+ION_MODE\s+(.*)$']
-regex_massbank['precursor_mz'] = ['^MS\$FOCUSED_ION:\s+PRECURSOR_M/Z\s+(\d*[.,]?\d*)$']
-regex_massbank['precursor_type'] = ['^MS\$FOCUSED_ION:\s+PRECURSOR_TYPE\s+(.*)$']
-regex_massbank['num_peaks'] = ['^PK\$NUM_PEAK:\s+(\d*)']
-regex_massbank['cols'] = ['^PK\$PEAK:\s+(.*)']
-regex_massbank['massbank'] = ['^RECORD_TITLE:(.*)$']  # Flag for massbank format
+regex_massbank['name'] = [r'^RECORD_TITLE:(.*)$']
+regex_massbank['polarity'] = [r'^AC\$MASS_SPECTROMETRY:\s+ION_MODE\s+(.*)$']
+regex_massbank['precursor_mz'] = [
+    r'^MS\$FOCUSED_ION:\s+PRECURSOR_M/Z\s+(\d*[.,]?\d*)$']
+regex_massbank['precursor_type'] = [
+    r'^MS\$FOCUSED_ION:\s+PRECURSOR_TYPE\s+(.*)$']
+regex_massbank['num_peaks'] = [r'^PK\$NUM_PEAK:\s+(\d*)']
+regex_massbank['cols'] = [r'^PK\$PEAK:\s+(.*)']
+regex_massbank['massbank'] = [
+    r'^RECORD_TITLE:(.*)$']  # Flag for massbank format
 
 if args.schema == 'msp':
     meta_regex = regex_msp
 elif args.schema == 'massbank':
     meta_regex = regex_massbank
 elif args.schema == 'auto':
-    # If auto we just check for all the available paramter names and then determine if Massbank or MSP based on
-    # the name parameter
+    # If auto we just check for all the available paramter names and then
+    # determine if Massbank or MSP based on the name parameter
     meta_regex = {}
     meta_regex.update(regex_massbank)
     meta_regex['name'].extend(regex_msp['name'])
@@ -116,8 +122,8 @@
     meta_regex['precursor_type'].extend(regex_msp['precursor_type'])
     meta_regex['num_peaks'].extend(regex_msp['num_peaks'])
     meta_regex['msp'] = regex_msp['msp']
-
-
+else:
+    sys.exit("No schema selected")
 
 adduct_types = {
     '[M+H]+': 1.007276,
@@ -131,14 +137,19 @@
     '[M-H]-': -1.007276,
     '[M+Cl]-': 34.969402,
     '[M+HCOO]-': 44.99819,
-    '[M-H+HCOOH]-': 44.99819,  # same as above but different style of writing adduct
+    '[M-H+HCOOH]-': 44.99819,
+    # same as above but different style of writing adduct
     '[M+CH3COO]-': 59.01385,
-    '[M-H+CH3COOH]-': 59.01385  # same as above but different style of writing adduct
+    '[M-H+CH3COOH]-': 59.01385
+    # same as above but different style of writing adduct
 }
 inv_adduct_types = {int(round(v, 0)): k for k, v in adduct_types.iteritems()}
 
+
 # function to extract the meta data using the regular expressions
-def parse_meta(meta_regex, meta_info={}):
+def parse_meta(meta_regex, meta_info=None):
+    if meta_info is None:
+        meta_info = {}
     for k, regexes in six.iteritems(meta_regex):
         for reg in regexes:
             m = re.search(reg, line, re.IGNORECASE)
@@ -147,7 +158,6 @@
     return meta_info
 
 
-
 ######################################################################
 # Setup parameter dictionary
 ######################################################################
@@ -159,15 +169,23 @@
     if args.MetFragDatabaseType == "LocalCSV":
         paramd["LocalDatabasePath"] = args.LocalDatabasePath
     elif args.MetFragDatabaseType == "MetChem":
-        paramd["LocalMetChemDatabase"] = config.get('MetChem', 'LocalMetChemDatabase')
-        paramd["LocalMetChemDatabasePortNumber"] = config.get('MetChem', 'LocalMetChemDatabasePortNumber')
-        paramd["LocalMetChemDatabaseServerIp"] = args.LocalMetChemDatabaseServerIp
-        paramd["LocalMetChemDatabaseUser"] = config.get('MetChem', 'LocalMetChemDatabaseUser')
-        paramd["LocalMetChemDatabasePassword"] = config.get('MetChem', 'LocalMetChemDatabasePassword')
+        paramd["LocalMetChemDatabase"] = \
+            config.get('MetChem', 'LocalMetChemDatabase')
+        paramd["LocalMetChemDatabasePortNumber"] = \
+            config.get('MetChem', 'LocalMetChemDatabasePortNumber')
+        paramd["LocalMetChemDatabaseServerIp"] = \
+            args.LocalMetChemDatabaseServerIp
+        paramd["LocalMetChemDatabaseUser"] = \
+            config.get('MetChem', 'LocalMetChemDatabaseUser')
+        paramd["LocalMetChemDatabasePassword"] = \
+            config.get('MetChem', 'LocalMetChemDatabasePassword')
 
-    paramd["FragmentPeakMatchAbsoluteMassDeviation"] = args.FragmentPeakMatchAbsoluteMassDeviation
-    paramd["FragmentPeakMatchRelativeMassDeviation"] = args.FragmentPeakMatchRelativeMassDeviation
-    paramd["DatabaseSearchRelativeMassDeviation"] = args.DatabaseSearchRelativeMassDeviation
+    paramd["FragmentPeakMatchAbsoluteMassDeviation"] = \
+        args.FragmentPeakMatchAbsoluteMassDeviation
+    paramd["FragmentPeakMatchRelativeMassDeviation"] = \
+        args.FragmentPeakMatchRelativeMassDeviation
+    paramd["DatabaseSearchRelativeMassDeviation"] = \
+        args.DatabaseSearchRelativeMassDeviation
     paramd["SampleName"] = ''
    paramd["ResultsPath"] = os.path.join(wd)
 
@@ -210,17 +228,20 @@
 
     if args.FilterSmartsInclusionList:
         filterh.append('SmartsSubstructureInclusionFilter')
-        dct_filter['FilterSmartsInclusionList'] = args.FilterSmartsInclusionList
+        dct_filter[
+            'FilterSmartsInclusionList'] = args.FilterSmartsInclusionList
 
     if args.FilterSmartsExclusionList:
         filterh.append('SmartsSubstructureExclusionFilter')
-        dct_filter['FilterSmartsExclusionList'] = args.FilterSmartsExclusionList
+        dct_filter[
+            'FilterSmartsExclusionList'] = args.FilterSmartsExclusionList
 
-    # My understanding is that both 'ElementInclusionExclusiveFilter' and 'ElementExclusionFilter' use
-    # 'FilterIncludedElements'
+    # My understanding is that both 'ElementInclusionExclusiveFilter'
+    # and 'ElementExclusionFilter' use 'FilterIncludedElements'
    if args.FilterIncludedExclusiveElements:
         filterh.append('ElementInclusionExclusiveFilter')
-        dct_filter['FilterIncludedElements'] = args.FilterIncludedExclusiveElements
+        dct_filter[
+            'FilterIncludedElements'] = args.FilterIncludedExclusiveElements
 
     if args.FilterIncludedElements:
         filterh.append('ElementInclusionFilter')
@@ -244,10 +265,11 @@
 # Function to run metfrag when all metainfo and peaks have been parsed
 ######################################################################
 def run_metfrag(meta_info, peaklist, args, wd, spectrac, adduct_types):
-    # Get sample details (if possible to extract) e.g. if created as part of the msPurity pipeline)
-    # choose between getting additional details to add as columns as either all meta data from msp, just
-    # details from the record name (i.e. when using msPurity and we have the columns coded into the name) or
-    # just the spectra index (spectrac)]
+    # Get sample details (if possible to extract) e.g. if created as part of
+    # the msPurity pipeline) choose between getting additional details to add
+    # as columns as either all meta data from msp, just details from the
+    # record name (i.e. when using msPurity and we have the columns coded into
+    # the name) or just the spectra index (spectrac)].
     # Returns the parameters used and the command line call
     paramd = init_paramd(args)
 
@@ -255,21 +277,24 @@
         # have additional column of just the name
         paramd['additional_details'] = {'name': meta_info['name']}
     elif args.meta_select_col == 'name_split':
-        # have additional columns split by "|" and then on ":" e.g. MZ:100.2 | RT:20 | xcms_grp_id:1
-        paramd['additional_details'] = {sm.split(":")[0].strip(): sm.split(":")[1].strip() for sm in
-                                        meta_info['name'].split("|")}
+        # have additional columns split by "|" and
+        # then on ":" e.g. MZ:100.2 | RT:20 | xcms_grp_id:1
+        paramd['additional_details'] = {
+            sm.split(":")[0].strip(): sm.split(":")[1].strip() for sm in
+            meta_info['name'].split("|")}
     elif args.meta_select_col == 'all':
-        # have additional columns based on all the meta information extracted from the MSP
+        # have additional columns based on all the meta information
+        # extracted from the MSP
         paramd['additional_details'] = meta_info
     else:
         # Just have and index of the spectra in the MSP file
         paramd['additional_details'] = {'spectra_idx': spectrac}
 
-
     paramd["SampleName"] = "{}_metfrag_result".format(spectrac)
 
     # =============== Output peaks to txt file ==============================
-    paramd["PeakListPath"] = os.path.join(wd, "{}_tmpspec.txt".format(spectrac))
+    paramd["PeakListPath"] = os.path.join(wd,
+                                          "{}_tmpspec.txt".format(spectrac))
 
     # write spec file
     with open(paramd["PeakListPath"], 'w') as outfile:
@@ -278,10 +303,13 @@
 
     # =============== Update param based on MSP metadata ======================
     # Replace param details with details from MSP if required
-    if 'precursor_type' in meta_info and meta_info['precursor_type'] in adduct_types:
+    if 'precursor_type' in meta_info and \
+            meta_info['precursor_type'] in adduct_types:
         adduct = meta_info['precursor_type']
-        nm = float(meta_info['precursor_mz']) - adduct_types[meta_info['precursor_type']]
-        paramd["PrecursorIonMode"] = int(round(adduct_types[meta_info['precursor_type']], 0))
+        nm = float(meta_info['precursor_mz']) - adduct_types[
+            meta_info['precursor_type']]
+        paramd["PrecursorIonMode"] = \
+            int(round(adduct_types[meta_info['precursor_type']], 0))
     elif not args.skip_invalid_adducts:
         adduct = inv_adduct_types[int(paramd['PrecursorIonModeDefault'])]
         paramd["PrecursorIonMode"] = paramd['PrecursorIonModeDefault']
@@ -293,22 +321,21 @@
     paramd['additional_details']['adduct'] = adduct
     paramd["NeutralPrecursorMass"] = nm
 
-    # =============== Create CLI cmd for metfrag ===============================
+    # ============== Create CLI cmd for metfrag ===============================
     cmd = "metfrag"
 
     for k, v in six.iteritems(paramd):
-        if not k in ['PrecursorIonModeDefault', 'nm_mass_diff_default', 'additional_details']:
+        if k not in ['PrecursorIonModeDefault', 'nm_mass_diff_default',
+                     'additional_details']:
             cmd += " {}={}".format(str(k), str(v))
 
-    # =============== Run metfrag ==============================================
-    #print(cmd)
+    # ============== Run metfrag ==============================================
+    # print(cmd)
 
     # Filter before process with a minimum number of MS/MS peaks
     if plinesread >= float(args.minMSMSpeaks):
         if int(args.cores_top_level) == 1:
             os.system(cmd)
-
-
     return paramd, cmd
 
@@ -316,7 +343,6 @@
     return [os.system(cmd) for cmd in cmds]
 
 
-
 ######################################################################
 # Parse MSP file and run metfrag CLI
 ######################################################################
@@ -338,27 +364,29 @@
         line = line.strip()
         if pnumlines == 0:
-            # =============== Extract metadata from MSP ========================
+            # ============== Extract metadata from MSP ========================
             meta_info = parse_meta(meta_regex, meta_info)
 
-            if ('massbank' in meta_info and 'cols' in meta_info) or ('msp' in meta_info and 'num_peaks' in meta_info):
-
+            if ('massbank' in meta_info and 'cols' in meta_info) or (
+                    'msp' in meta_info and 'num_peaks' in meta_info):
                 pnumlines = int(meta_info['num_peaks'])
                 plinesread = 0
                 peaklist = []
 
         elif plinesread < pnumlines:
-            # =============== Extract peaks from MSP ==========================
-            line = tuple(line.split())  # .split() will split on any empty space (i.e. tab and space)
+            # ============== Extract peaks from MSP ==========================
+            # .split() will split on any empty space (i.e. tab and space)
+            line = tuple(line.split())
 
             # Keep only m/z and intensity, not relative intensity
            save_line = tuple(line[0].split() + line[1].split())
 
             plinesread += 1
             peaklist.append(save_line)
 
         elif plinesread and plinesread == pnumlines:
-            # =============== Get sample name and additional details for output =======
+            # ======= Get sample name and additional details for output =======
            spectrac += 1
-            paramd, cmd = run_metfrag(meta_info, peaklist, args, wd, spectrac, adduct_types)
+            paramd, cmd = run_metfrag(meta_info, peaklist, args, wd, spectrac,
+                                      adduct_types)
 
             if paramd:
                 paramds[paramd["SampleName"]] = paramd
@@ -371,18 +399,17 @@
 
 # end of file. Check if there is a MSP spectra to run metfrag on still
 if plinesread and plinesread == pnumlines:
-    paramd, cmd = run_metfrag(meta_info, peaklist, args, wd, spectrac+1, adduct_types)
+    paramd, cmd = run_metfrag(meta_info, peaklist, args, wd, spectrac + 1,
+                              adduct_types)
 
     if paramd:
         paramds[paramd["SampleName"]] = paramd
         cmds.append(cmd)
 
-
-
-
 # Perform multiprocessing on command line call level
 if int(args.cores_top_level) > 1:
-    cmds_chunks = [cmds[x:x + int(args.chunks)] for x in list(range(0, len(cmds), int(args.chunks)))]
+    cmds_chunks = [cmds[x:x + int(args.chunks)] for x in
+                   list(range(0, len(cmds), int(args.chunks)))]
     pool = multiprocessing.Pool(processes=int(args.cores_top_level))
     pool.map(work, cmds_chunks)
     pool.close()
@@ -391,8 +418,10 @@
 ######################################################################
 # Concatenate and filter the output
 ######################################################################
-# outputs might have different headers. Need to get a list of all the headers before we start merging the files
-# outfiles = [os.path.join(wd, f) for f in glob.glob(os.path.join(wd, "*_metfrag_result.csv"))]
+# outputs might have different headers. Need to get a list of all the
+# headers before we start merging the files
+# outfiles = [os.path.join(wd, f) for f in glob.glob(os.path.join(wd,
+# "*_metfrag_result.csv"))]
 outfiles = glob.glob(os.path.join(wd, "*_metfrag_result.csv"))
 
 if len(outfiles) == 0:
@@ -408,20 +437,22 @@
             headers.extend(next(reader))
         else:
             headers.extend(reader.next())
-        # check if file has any data rows
+        # check if file has any data rows
         for i, row in enumerate(reader):
             c += 1
             if i == 1:
                 break
 
-# if no data rows (e.g. matches) then do not save an output and leave the program
+# if no data rows (e.g. matches) then do not save an
+# output and leave the program
 if c == 0:
     print('No results')
     sys.exit()
 
 additional_detail_headers = ['sample_name']
 for k, paramd in six.iteritems(paramds):
-    additional_detail_headers = list(set(additional_detail_headers + list(paramd['additional_details'].keys())))
+    additional_detail_headers = list(set(
+        additional_detail_headers + list(paramd['additional_details'].keys())))
 
 # add inchikey if not already present (missing in metchem output)
 if 'InChIKey' not in headers:
@@ -429,17 +460,16 @@
 headers = additional_detail_headers + sorted(list(set(headers)))
 
-
-
 # Sort files nicely
-outfiles.sort(key = lambda s: int(re.match('^.*/(\d+)_metfrag_result.csv', s).group(1)))
+outfiles.sort(
+    key=lambda s: int(re.match(r'^.*/(\d+)_metfrag_result.csv', s).group(1)))
 
 print(outfiles)
 
-
 # merge outputs
 with open(args.result_pth, 'a') as merged_outfile:
-    dwriter = csv.DictWriter(merged_outfile, fieldnames=headers, delimiter='\t', quotechar='"')
+    dwriter = csv.DictWriter(merged_outfile, fieldnames=headers,
+                             delimiter='\t', quotechar='"')
 
     dwriter.writeheader()
 
     for fn in outfiles:
@@ -451,7 +481,8 @@
                 for key, value in line.items():
                     # Filter when no MS/MS peak matched
                     if key == "ExplPeaks":
-                        if float(args.pctexplpeak_thrshld) > 0 and "NA" in value:
+                        if float(args.pctexplpeak_thrshld) > 0 and \
+                                "NA" in value:
                             bewrite = False
                     # Filter with a score threshold
                     elif key == "Score":
@@ -477,10 +508,13 @@
                     line['sample_name'] = paramds[bfn]['SampleName']
                     ad = paramds[bfn]['additional_details']
 
-                    if args.MetFragDatabaseType == "MetChem":
-                        # for some reason the metchem database option does not report the full inchikey (at least
-                        # in the Bham setup. This ensures we always get the fully inchikey
-                        line['InChIKey'] = '{}-{}-{}'.format(line['InChIKey1'], line['InChIKey2'], line['InChIKey3'])
+                    if args.MetFragDatabaseType == "MetChem":
+                        # for some reason the metchem database option does
+                        # not report the full inchikey (at least in the Bham
+                        # setup. This ensures we always get the fully inchikey
+                        line['InChIKey'] = '{}-{}-{}'.format(line['InChIKey1'],
+                                                             line['InChIKey2'],
+                                                             line['InChIKey3'])
                     line.update(ad)
                     dwriter.writerow(line)
 
```
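The `parse_meta` change from `meta_info={}` to `meta_info=None` avoids Python's shared mutable default argument, which would otherwise let metadata captured for one spectrum leak into later calls. A minimal sketch of the pitfall and of the patched pattern, using hypothetical `demo_bad`/`demo_good` helpers that are not part of metfrag.py:

```python
# Illustration of the mutable-default-argument fix applied to parse_meta().
# demo_bad and demo_good are hypothetical helpers, not functions from metfrag.py.


def demo_bad(key, value, meta_info={}):
    meta_info[key] = value  # mutates the one dict shared by every call
    return meta_info


def demo_good(key, value, meta_info=None):
    if meta_info is None:  # same pattern as the patched parse_meta()
        meta_info = {}
    meta_info[key] = value
    return meta_info


print(demo_bad('name', 'spectrum_1'))   # {'name': 'spectrum_1'}
print(demo_bad('rt', '42'))             # {'name': 'spectrum_1', 'rt': '42'}  state leaked
print(demo_good('name', 'spectrum_1'))  # {'name': 'spectrum_1'}
print(demo_good('rt', '42'))            # {'rt': '42'}  fresh dict each call
```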
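In `run_metfrag` the adduct taken from the record's precursor type drives two MetFrag parameters: `NeutralPrecursorMass` is the precursor m/z minus the adduct mass from `adduct_types`, and `PrecursorIonMode` is that adduct mass rounded to the nearest integer. A worked sketch using the `[M+H]+` and `[M-H]-` entries from the table in the diff; the precursor m/z values are made-up examples:

```python
# Worked example of the NeutralPrecursorMass / PrecursorIonMode calculation in
# run_metfrag(). The precursor m/z values below are arbitrary illustrations.
adduct_types = {'[M+H]+': 1.007276, '[M-H]-': -1.007276}

examples = [('[M+H]+', 303.049899), ('[M-H]-', 301.035347)]
for precursor_type, precursor_mz in examples:
    # NeutralPrecursorMass: strip the adduct mass from the measured m/z
    nm = precursor_mz - adduct_types[precursor_type]
    # PrecursorIonMode: nominal (rounded) adduct mass, e.g. 1 for [M+H]+, -1 for [M-H]-
    mode = int(round(adduct_types[precursor_type], 0))
    print(precursor_type, round(nm, 6), mode)
# [M+H]+ 302.042623 1
# [M-H]- 302.042623 -1
```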
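With `--meta_select_col name_split`, the record name is expected to encode extra columns in the `MZ:100.2 | RT:20 | xcms_grp_id:1` style mentioned in the comment, and the dict comprehension in the diff turns each `key:value` field into an additional output column. A standalone sketch of that parsing; the example name string simply follows the format given in the comment:

```python
# Sketch of the 'name_split' handling in run_metfrag(): the record name is
# split on "|" into fields and each field on ":" into a key/value pair.
name = 'MZ:100.2 | RT:20 | xcms_grp_id:1'  # example name in the documented format

additional_details = {
    sm.split(":")[0].strip(): sm.split(":")[1].strip()
    for sm in name.split("|")}

print(additional_details)  # {'MZ': '100.2', 'RT': '20', 'xcms_grp_id': '1'}
```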
