Mercurial > repos > tomnl > metfrag
diff metfrag.py @ 2:d040e27b6225 draft
planemo upload for repository https://github.com/computational-metabolomics/metfrag-galaxy commit 59631850110e60900d03f642c445cc35a36414b5
| author | tomnl |
|---|---|
| date | Mon, 03 Jun 2019 11:20:55 -0400 |
| parents | c1b168770b68 |
| children | 5ee936e570a7 |
line wrap: on
line diff
--- a/metfrag.py Fri May 31 04:41:09 2019 -0400 +++ b/metfrag.py Mon Jun 03 11:20:55 2019 -0400 @@ -71,96 +71,6 @@ os.makedirs(wd) ###################################################################### -# Setup parameter dictionary -###################################################################### -paramd = defaultdict() - -paramd["MetFragDatabaseType"] = args.MetFragDatabaseType - -if args.MetFragDatabaseType == "LocalCSV": - paramd["LocalDatabasePath"] = args.LocalDatabasePath -elif args.MetFragDatabaseType == "MetChem": - paramd["LocalMetChemDatabase"] = "metchem" - paramd["LocalMetChemDatabasePortNumber"] = 5432 - paramd["LocalMetChemDatabaseServerIp"] = args.LocalMetChemDatabaseServerIp - paramd["LocalMetChemDatabaseUser"] = "metchemro" - paramd["LocalMetChemDatabasePassword"] = "metchemro" - -paramd["FragmentPeakMatchAbsoluteMassDeviation"] = args.FragmentPeakMatchAbsoluteMassDeviation -paramd["FragmentPeakMatchRelativeMassDeviation"] = args.FragmentPeakMatchRelativeMassDeviation -paramd["DatabaseSearchRelativeMassDeviation"] = args.DatabaseSearchRelativeMassDeviation -paramd["SampleName"] = '' -paramd["ResultsPath"] = os.path.join(wd) - -if args.polarity == "pos": - paramd["IsPositiveIonMode"] = True - paramd["PrecursorIonModeDefault"] = "1" - paramd["PrecursorIonMode"] = "1" - paramd["nm_mass_diff_default"] = 1.007276 -else: - paramd["IsPositiveIonMode"] = False - paramd["PrecursorIonModeDefault"] = "-1" - paramd["PrecursorIonMode"] = "-1" - paramd["nm_mass_diff_default"] = -1.007276 - -paramd["MetFragCandidateWriter"] = "CSV" -paramd["NumberThreads"] = args.NumberThreads - -if args.ScoreSuspectLists: - paramd["ScoreSuspectLists"] = args.ScoreSuspectLists - -paramd["MetFragScoreTypes"] = args.MetFragScoreTypes -paramd["MetFragScoreWeights"] = args.MetFragScoreWeights - -dct_filter = defaultdict() -filterh = [] - -if args.UnconnectedCompoundFilter: - filterh.append('UnconnectedCompoundFilter') - -if args.IsotopeFilter: - filterh.append('IsotopeFilter') - -if args.FilterMinimumElements: - filterh.append('MinimumElementsFilter') - dct_filter['FilterMinimumElements'] = args.FilterMinimumElements - -if args.FilterMaximumElements: - filterh.append('MaximumElementsFilter') - dct_filter['FilterMaximumElements'] = args.FilterMaximumElements - -if args.FilterSmartsInclusionList: - filterh.append('SmartsSubstructureInclusionFilter') - dct_filter['FilterSmartsInclusionList'] = args.FilterSmartsInclusionList - -if args.FilterSmartsExclusionList: - filterh.append('SmartsSubstructureExclusionFilter') - dct_filter['FilterSmartsExclusionList'] = args.FilterSmartsExclusionList - -# My understanding is that both 'ElementInclusionExclusiveFilter' and 'ElementExclusionFilter' use -# 'FilterIncludedElements' -if args.FilterIncludedExclusiveElements: - filterh.append('ElementInclusionExclusiveFilter') - dct_filter['FilterIncludedElements'] = args.FilterIncludedExclusiveElements - -if args.FilterIncludedElements: - filterh.append('ElementInclusionFilter') - dct_filter['FilterIncludedElements'] = args.FilterIncludedElements - -if args.FilterExcludedElements: - filterh.append('ElementExclusionFilter') - dct_filter['FilterExcludedElements'] = args.FilterExcludedElements - -if filterh: - fcmds = ','.join(filterh) + ' ' - for k, v in six.iteritems(dct_filter): - fcmds += "{0}={1} ".format(str(k), str(v)) - - paramd["MetFragPreProcessingCandidateFilter"] = fcmds - -print(paramd) - -###################################################################### # Setup regular expressions for MSP parsing dictionary ###################################################################### regex_msp = {} @@ -198,22 +108,6 @@ print(meta_regex) - - -# this dictionary will store the meta data results form the MSp file -meta_info = {} - - -# function to extract the meta data using the regular expressions -def parse_meta(meta_regex, meta_info={}): - for k, regexes in six.iteritems(meta_regex): - for reg in regexes: - m = re.search(reg, line, re.IGNORECASE) - if m: - meta_info[k] = '-'.join(m.groups()).strip() - return meta_info - - adduct_types = { '[M+H]+': 1.007276, '[M+NH4]+': 18.034374, @@ -227,6 +121,178 @@ '[M+Cl]-': 34.969402, } +# function to extract the meta data using the regular expressions +def parse_meta(meta_regex, meta_info={}): + for k, regexes in six.iteritems(meta_regex): + for reg in regexes: + m = re.search(reg, line, re.IGNORECASE) + if m: + meta_info[k] = '-'.join(m.groups()).strip() + return meta_info + + + +###################################################################### +# Setup parameter dictionary +###################################################################### +def init_paramd(args): + paramd = defaultdict() + + paramd["MetFragDatabaseType"] = args.MetFragDatabaseType + + if args.MetFragDatabaseType == "LocalCSV": + paramd["LocalDatabasePath"] = args.LocalDatabasePath + elif args.MetFragDatabaseType == "MetChem": + paramd["LocalMetChemDatabase"] = "metchem" + paramd["LocalMetChemDatabasePortNumber"] = 5432 + paramd["LocalMetChemDatabaseServerIp"] = args.LocalMetChemDatabaseServerIp + paramd["LocalMetChemDatabaseUser"] = "metchemro" + paramd["LocalMetChemDatabasePassword"] = "metchemro" + + paramd["FragmentPeakMatchAbsoluteMassDeviation"] = args.FragmentPeakMatchAbsoluteMassDeviation + paramd["FragmentPeakMatchRelativeMassDeviation"] = args.FragmentPeakMatchRelativeMassDeviation + paramd["DatabaseSearchRelativeMassDeviation"] = args.DatabaseSearchRelativeMassDeviation + paramd["SampleName"] = '' + paramd["ResultsPath"] = os.path.join(wd) + + if args.polarity == "pos": + paramd["IsPositiveIonMode"] = True + paramd["PrecursorIonModeDefault"] = "1" + paramd["PrecursorIonMode"] = "1" + paramd["nm_mass_diff_default"] = 1.007276 + else: + paramd["IsPositiveIonMode"] = False + paramd["PrecursorIonModeDefault"] = "-1" + paramd["PrecursorIonMode"] = "-1" + paramd["nm_mass_diff_default"] = -1.007276 + + paramd["MetFragCandidateWriter"] = "CSV" + paramd["NumberThreads"] = args.NumberThreads + + if args.ScoreSuspectLists: + paramd["ScoreSuspectLists"] = args.ScoreSuspectLists + + paramd["MetFragScoreTypes"] = args.MetFragScoreTypes + paramd["MetFragScoreWeights"] = args.MetFragScoreWeights + + dct_filter = defaultdict() + filterh = [] + + if args.UnconnectedCompoundFilter: + filterh.append('UnconnectedCompoundFilter') + + if args.IsotopeFilter: + filterh.append('IsotopeFilter') + + if args.FilterMinimumElements: + filterh.append('MinimumElementsFilter') + dct_filter['FilterMinimumElements'] = args.FilterMinimumElements + + if args.FilterMaximumElements: + filterh.append('MaximumElementsFilter') + dct_filter['FilterMaximumElements'] = args.FilterMaximumElements + + if args.FilterSmartsInclusionList: + filterh.append('SmartsSubstructureInclusionFilter') + dct_filter['FilterSmartsInclusionList'] = args.FilterSmartsInclusionList + + if args.FilterSmartsExclusionList: + filterh.append('SmartsSubstructureExclusionFilter') + dct_filter['FilterSmartsExclusionList'] = args.FilterSmartsExclusionList + + # My understanding is that both 'ElementInclusionExclusiveFilter' and 'ElementExclusionFilter' use + # 'FilterIncludedElements' + if args.FilterIncludedExclusiveElements: + filterh.append('ElementInclusionExclusiveFilter') + dct_filter['FilterIncludedElements'] = args.FilterIncludedExclusiveElements + + if args.FilterIncludedElements: + filterh.append('ElementInclusionFilter') + dct_filter['FilterIncludedElements'] = args.FilterIncludedElements + + if args.FilterExcludedElements: + filterh.append('ElementExclusionFilter') + dct_filter['FilterExcludedElements'] = args.FilterExcludedElements + + if filterh: + fcmds = ','.join(filterh) + ' ' + for k, v in six.iteritems(dct_filter): + fcmds += "{0}={1} ".format(str(k), str(v)) + + paramd["MetFragPreProcessingCandidateFilter"] = fcmds + + return paramd + + +###################################################################### +# Function to run metfrag when all metainfo and peaks have been parsed +###################################################################### +def run_metfrag(meta_info, peaklist, args, wd, spectrac, adduct_types): + # Get sample details (if possible to extract) e.g. if created as part of the msPurity pipeline) + # choose between getting additional details to add as columns as either all meta data from msp, just + # details from the record name (i.e. when using msPurity and we have the columns coded into the name) or + # just the spectra index (spectrac)] + # Returns the parameters used and the command line call + + paramd = init_paramd(args) + if args.meta_select_col == 'name': + # have additional column of just the name + paramd['additional_details'] = {'name': meta_info['name']} + elif args.meta_select_col == 'name_split': + # have additional columns split by "|" and then on ":" e.g. MZ:100.2 | RT:20 | xcms_grp_id:1 + paramd['additional_details'] = {sm.split(":")[0].strip(): sm.split(":")[1].strip() for sm in + meta_info['name'].split("|")} + elif args.meta_select_col == 'all': + # have additional columns based on all the meta information extracted from the MSP + paramd['additional_details'] = meta_info + else: + # Just have and index of the spectra in the MSP file + paramd['additional_details'] = {'spectra_idx': spectrac} + + paramd["SampleName"] = "{}_metfrag_result".format(spectrac) + + # =============== Output peaks to txt file ============================== + paramd["PeakListPath"] = os.path.join(wd, "{}_tmpspec.txt".format(spectrac)) + + # write spec file + with open(paramd["PeakListPath"], 'w') as outfile: + for p in peaklist: + outfile.write(p[0] + "\t" + p[1] + "\n") + + # =============== Update param based on MSP metadata ====================== + # Replace param details with details from MSP if required + if 'precursor_type' in meta_info and meta_info['precursor_type'] in adduct_types: + + nm = float(meta_info['precursor_mz']) + adduct_types[meta_info['precursor_type']] + paramd["PrecursorIonMode"] = int(round(adduct_types[meta_info['precursor_type']], 0)) + else: + + paramd["PrecursorIonMode"] = paramd['PrecursorIonModeDefault'] + nm = float(meta_info['precursor_mz']) + paramd['nm_mass_diff_default'] + + paramd["NeutralPrecursorMass"] = nm + + # =============== Create CLI cmd for metfrag =============================== + cmd = "metfrag" + for k, v in six.iteritems(paramd): + if not k in ['PrecursorIonModeDefault', 'nm_mass_diff_default', 'additional_details']: + cmd += " {}={}".format(str(k), str(v)) + + # =============== Run metfrag ============================================== + # Filter before process with a minimum number of MS/MS peaks + if plinesread >= float(args.minMSMSpeaks): + + if int(args.cores_top_level) == 1: + os.system(cmd) + + return paramd, cmd + + +def work(cmds): + return [os.system(cmd) for cmd in cmds] + + + ###################################################################### # Parse MSP file and run metfrag CLI ###################################################################### @@ -236,100 +302,57 @@ paramds = {} # keep count of spectra (for uid) spectrac = 0 +# this dictionary will store the meta data results form the MSp file +meta_info = {} with open(args.input_pth, "r") as infile: - numlines = 0 + # number of lines for the peaks + pnumlines = 0 + # number of lines read for the peaks + plinesread = 0 for line in infile: line = line.strip() - print(line) - if numlines == 0: + + if pnumlines == 0: # =============== Extract metadata from MSP ======================== meta_info = parse_meta(meta_regex, meta_info) - print(meta_info) + if ('massbank' in meta_info and 'cols' in meta_info) or ('msp' in meta_info and 'num_peaks' in meta_info): - print('check') - numlines = int(meta_info['num_peaks']) - linesread = 0 + + pnumlines = int(meta_info['num_peaks']) + plinesread = 0 peaklist = [] - elif linesread < numlines: + elif plinesread < pnumlines: # =============== Extract peaks from MSP ========================== line = tuple(line.split()) # .split() will split on any empty space (i.e. tab and space) # Keep only m/z and intensity, not relative intensity save_line = tuple(line[0].split() + line[1].split()) - linesread += 1 + plinesread += 1 peaklist.append(save_line) - elif linesread == numlines: + elif plinesread and plinesread == pnumlines: # =============== Get sample name and additional details for output ======= - # use a unique uuid4 to keep track of processing (important for multicore) - #rd = str(uuid.uuid4()) spectrac += 1 - - # Get sample details (if possible to extract) e.g. if created as part of the msPurity pipeline) - # choose between getting additional details to add as columns as either all meta data from msp, just - # details from the record name (i.e. when using msPurity and we have the columns coded into the name) or - # just the spectra index (spectrac) - if args.meta_select_col == 'name': - # have additional column of just the name - paramd['additional_details'] = {'name': meta_info['name']} - elif args.meta_select_col == 'name_split': - # have additional columns split by "|" and then on ":" e.g. MZ:100.2 | RT:20 | xcms_grp_id:1 - sampled = {sm.split(":")[0].strip(): sm.split(":")[1].strip() for sm in - meta_info['name'].split("|")} - elif args.meta_select_col == 'all': - # have additional columns based on all the meta information extracted from the MSP - paramd['additional_details'] = meta_info - else: - # Just have and index of the spectra in the MSP file - paramd['additional_details'] = {'spectra_idx': spectrac} - - paramd["SampleName"] = "{}_metfrag_result".format(spectrac) + paramd, cmd = run_metfrag(meta_info, peaklist, args, wd, spectrac, adduct_types) - # =============== Output peaks to txt file ============================== - numlines = 0 - paramd["PeakListPath"] = os.path.join(wd, "{}_tmpspec.txt".format(spectrac)) - print(paramd["PeakListPath"]) - # write spec file - with open(paramd["PeakListPath"], 'w') as outfile: - for p in peaklist: - outfile.write(p[0] + "\t" + p[1] + "\n") - - # =============== Update param based on MSP metadata ====================== - # Replace param details with details from MSP if required - if 'precursor_type' in meta_info and meta_info['precursor_type'] in adduct_types: - - nm = float(meta_info['precursor_mz']) + adduct_types[meta_info['precursor_type']] - paramd["PrecursorIonMode"] = int(round(adduct_types[meta_info['precursor_type']], 0)) - else: - - paramd["PrecursorIonMode"] = paramd['PrecursorIonModeDefault'] - nm = float(meta_info['precursor_mz']) + paramd['nm_mass_diff_default'] - - paramd["NeutralPrecursorMass"] = nm - - # =============== Create CLI cmd for metfrag =============================== - cmd = "metfrag" - for k, v in six.iteritems(paramd): - if not k in ['PrecursorIonModeDefault', 'nm_mass_diff_default', 'additional_details']: - cmd += " {}={}".format(str(k), str(v)) paramds[paramd["SampleName"]] = paramd - - # =============== Run metfrag ============================================== - # Filter before process with a minimum number of MS/MS peaks - if linesread >= float(args.minMSMSpeaks): - - if int(args.cores_top_level) > 1: - cmds.append(cmd) - else: - print(cmd) - os.system(cmd) + cmds.append(cmd) meta_info = {} + pnumlines = 0 + plinesread = 0 + + # end of file. Check if there is a MSP spectra to run metfrag on still + if plinesread and plinesread == pnumlines: + + paramd, cmd = run_metfrag(meta_info, peaklist, args, wd, spectrac+1, adduct_types) + + paramds[paramd["SampleName"]] = paramd + cmds.append(cmd) -def work(cmds): - return [os.system(cmd) for cmd in cmds] + # Perform multiprocessing on command line call level @@ -372,6 +395,13 @@ headers = additional_detail_headers + sorted(list(set(headers))) + +# Sort files nicely +outfiles.sort(key = lambda s: int(re.match('^.*/(\d+)_metfrag_result.csv', s).group(1))) + +print(outfiles) + + # merge outputs with open(args.result_pth, 'a') as merged_outfile: dwriter = csv.DictWriter(merged_outfile, fieldnames=headers, delimiter='\t', quotechar='"',
