diff metfrag.py @ 1:c1b168770b68 draft

planemo upload for repository https://github.com/computational-metabolomics/metfrag-galaxy commit 5b384bd55e5bb4b0dc1daebd2ef5d3ee0e379b2e-dirty
author tomnl
date Fri, 31 May 2019 04:41:09 -0400
parents 75c805123b45
children d040e27b6225
line wrap: on
line diff
--- a/metfrag.py	Tue Jul 24 07:59:44 2018 -0400
+++ b/metfrag.py	Fri May 31 04:41:09 2019 -0400
@@ -1,117 +1,417 @@
+from __future__ import absolute_import, print_function
 import argparse
 import csv
 import os
 import sys
+import six
+import re
+import random
+import string
+import shutil
+import glob
+import tempfile
+import multiprocessing
+
+from subprocess import call
+from collections import defaultdict
+
 print(sys.version)
 
 parser = argparse.ArgumentParser()
-parser.add_argument('--input')
-parser.add_argument('--db_local')
-parser.add_argument('--db_online')
-parser.add_argument('--ppm')
-parser.add_argument('--ppm_frag')
-parser.add_argument('--fragmasstol')
-parser.add_argument('--polarity')
-parser.add_argument('--results')
-parser.add_argument('--threads')
+parser.add_argument('--input_pth')
+parser.add_argument('--result_pth', default='metfrag_result.csv')
+
+parser.add_argument('--temp_dir')
+parser.add_argument('--polarity', default='pos')
+parser.add_argument('--minMSMSpeaks', default=1)
+
+parser.add_argument('--MetFragDatabaseType', default='PubChem')
+parser.add_argument('--LocalDatabasePath', default='')
+parser.add_argument('--LocalMetChemDatabaseServerIp', default='')
+
+parser.add_argument('--DatabaseSearchRelativeMassDeviation', default=5)
+parser.add_argument('--FragmentPeakMatchRelativeMassDeviation', default=10)
+parser.add_argument('--FragmentPeakMatchAbsoluteMassDeviation', default=0.001)
+parser.add_argument('--NumberThreads', default=1)
+parser.add_argument('--UnconnectedCompoundFilter', action='store_true')
+parser.add_argument('--IsotopeFilter', action='store_true')
+
+parser.add_argument('--FilterMinimumElements', default='')
+parser.add_argument('--FilterMaximumElements', default='')
+parser.add_argument('--FilterSmartsInclusionList', default='')
+parser.add_argument('--FilterSmartsExclusionList', default='')
+parser.add_argument('--FilterIncludedElements', default='')
+parser.add_argument('--FilterExcludedElements', default='')
+parser.add_argument('--FilterIncludedExclusiveElements', default='')
+
+parser.add_argument('--score_thrshld', default=0)
+parser.add_argument('--pctexplpeak_thrshld', default=0)
+parser.add_argument('--schema')
+parser.add_argument('--cores_top_level', default=1)
+parser.add_argument('--chunks', default=1)
+parser.add_argument('--meta_select_col', default='name')
+
+parser.add_argument('--ScoreSuspectLists', default='')
+parser.add_argument('--MetFragScoreTypes', default="FragmenterScore,OfflineMetFusionScore")
+parser.add_argument('--MetFragScoreWeights', default="1.0,1.0")
 
 args = parser.parse_args()
-print args
+print(args)
+
+# Create temporary working directory
+if args.temp_dir:
+    wd = args.temp_dir
+else:
+    wd = tempfile.mkdtemp()
+
+if os.path.exists(wd):
+    shutil.rmtree(wd)
+    os.makedirs(wd)
+else:
+    os.makedirs(wd)
+
+######################################################################
+# Setup parameter dictionary
+######################################################################
+paramd = defaultdict()
+
+paramd["MetFragDatabaseType"] = args.MetFragDatabaseType
+
+if args.MetFragDatabaseType == "LocalCSV":
+    paramd["LocalDatabasePath"] = args.LocalDatabasePath
+elif args.MetFragDatabaseType == "MetChem":
+    paramd["LocalMetChemDatabase"] = "metchem"
+    paramd["LocalMetChemDatabasePortNumber"] = 5432
+    paramd["LocalMetChemDatabaseServerIp"] = args.LocalMetChemDatabaseServerIp
+    paramd["LocalMetChemDatabaseUser"] = "metchemro"
+    paramd["LocalMetChemDatabasePassword"] = "metchemro"
+
+paramd["FragmentPeakMatchAbsoluteMassDeviation"] = args.FragmentPeakMatchAbsoluteMassDeviation
+paramd["FragmentPeakMatchRelativeMassDeviation"] = args.FragmentPeakMatchRelativeMassDeviation
+paramd["DatabaseSearchRelativeMassDeviation"] = args.DatabaseSearchRelativeMassDeviation
+paramd["SampleName"] = ''
+paramd["ResultsPath"] = os.path.join(wd)
+
+if args.polarity == "pos":
+    paramd["IsPositiveIonMode"] = True
+    paramd["PrecursorIonModeDefault"] = "1"
+    paramd["PrecursorIonMode"] = "1"
+    paramd["nm_mass_diff_default"] = 1.007276
+else:
+    paramd["IsPositiveIonMode"] = False
+    paramd["PrecursorIonModeDefault"] = "-1"
+    paramd["PrecursorIonMode"] = "-1"
+    paramd["nm_mass_diff_default"] = -1.007276
+
+paramd["MetFragCandidateWriter"] = "CSV"
+paramd["NumberThreads"] = args.NumberThreads
+
+if args.ScoreSuspectLists:
+    paramd["ScoreSuspectLists"] = args.ScoreSuspectLists
+
+paramd["MetFragScoreTypes"] = args.MetFragScoreTypes
+paramd["MetFragScoreWeights"] = args.MetFragScoreWeights
+
+dct_filter = defaultdict()
+filterh = []
+
+if args.UnconnectedCompoundFilter:
+    filterh.append('UnconnectedCompoundFilter')
+
+if args.IsotopeFilter:
+    filterh.append('IsotopeFilter')
+
+if args.FilterMinimumElements:
+    filterh.append('MinimumElementsFilter')
+    dct_filter['FilterMinimumElements'] = args.FilterMinimumElements
+
+if args.FilterMaximumElements:
+    filterh.append('MaximumElementsFilter')
+    dct_filter['FilterMaximumElements'] = args.FilterMaximumElements
+
+if args.FilterSmartsInclusionList:
+    filterh.append('SmartsSubstructureInclusionFilter')
+    dct_filter['FilterSmartsInclusionList'] = args.FilterSmartsInclusionList
+
+if args.FilterSmartsExclusionList:
+    filterh.append('SmartsSubstructureExclusionFilter')
+    dct_filter['FilterSmartsExclusionList'] = args.FilterSmartsExclusionList
+
+# My understanding is that both 'ElementInclusionExclusiveFilter' and 'ElementExclusionFilter' use
+# 'FilterIncludedElements'
+if args.FilterIncludedExclusiveElements:
+    filterh.append('ElementInclusionExclusiveFilter')
+    dct_filter['FilterIncludedElements'] = args.FilterIncludedExclusiveElements
+
+if args.FilterIncludedElements:
+    filterh.append('ElementInclusionFilter')
+    dct_filter['FilterIncludedElements'] = args.FilterIncludedElements
 
-os.makedirs("tmet")
+if args.FilterExcludedElements:
+    filterh.append('ElementExclusionFilter')
+    dct_filter['FilterExcludedElements'] = args.FilterExcludedElements
+
+if filterh:
+    fcmds = ','.join(filterh) + ' '
+    for k, v in six.iteritems(dct_filter):
+        fcmds += "{0}={1} ".format(str(k), str(v))
+
+    paramd["MetFragPreProcessingCandidateFilter"] = fcmds
+
+print(paramd)
+
+######################################################################
+# Setup regular expressions for MSP parsing dictionary
+######################################################################
+regex_msp = {}
+regex_msp['name'] = ['^Name(?:=|:)(.*)$']
+regex_msp['polarity'] = ['^ion.*mode(?:=|:)(.*)$', '^ionization.*mode(?:=|:)(.*)$', '^polarity(?:=|:)(.*)$']
+regex_msp['precursor_mz'] = ['^precursor.*m/z(?:=|:)\s*(\d*[.,]?\d*)$', '^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$']
+regex_msp['precursor_type'] = ['^precursor.*type(?:=|:)(.*)$', '^adduct(?:=|:)(.*)$', '^ADDUCTIONNAME(?:=|:)(.*)$']
+regex_msp['num_peaks'] = ['^Num.*Peaks(?:=|:)\s*(\d*)$']
+regex_msp['msp'] = ['^Name(?:=|:)(.*)$']  # Flag for standard MSP format
+
+regex_massbank = {}
+regex_massbank['name'] = ['^RECORD_TITLE:(.*)$']
+regex_massbank['polarity'] = ['^AC\$MASS_SPECTROMETRY:\s+ION_MODE\s+(.*)$']
+regex_massbank['precursor_mz'] = ['^MS\$FOCUSED_ION:\s+PRECURSOR_M/Z\s+(\d*[.,]?\d*)$']
+regex_massbank['precursor_type'] = ['^MS\$FOCUSED_ION:\s+PRECURSOR_TYPE\s+(.*)$']
+regex_massbank['num_peaks'] = ['^PK\$NUM_PEAK:\s+(\d*)']
+regex_massbank['cols'] = ['^PK\$PEAK:\s+(.*)']
+regex_massbank['massbank'] = ['^RECORD_TITLE:(.*)$']  # Flag for massbank format
 
-with open(args.input,"r") as infile:
+if args.schema == 'msp':
+    meta_regex = regex_msp
+elif args.schema == 'massbank':
+    meta_regex = regex_massbank
+elif args.schema == 'auto':
+    # If auto we just check for all the available parameter names and then determine if Massbank or MSP based on
+    # the name parameter
+    meta_regex = {}
+    meta_regex.update(regex_massbank)
+    meta_regex['name'].extend(regex_msp['name'])
+    meta_regex['polarity'].extend(regex_msp['polarity'])
+    meta_regex['precursor_mz'].extend(regex_msp['precursor_mz'])
+    meta_regex['precursor_type'].extend(regex_msp['precursor_type'])
+    meta_regex['num_peaks'].extend(regex_msp['num_peaks'])
+    meta_regex['msp'] = regex_msp['msp']
+
+    print(meta_regex)
+
+
+
+# this dictionary will store the metadata results from the MSP file
+meta_info = {}
+
+
+# function to extract the meta data using the regular expressions
+def parse_meta(meta_regex, meta_info={}):
+    for k, regexes in six.iteritems(meta_regex):
+        for reg in regexes:
+            m = re.search(reg, line, re.IGNORECASE)
+            if m:
+                meta_info[k] = '-'.join(m.groups()).strip()
+    return meta_info
+
+
+adduct_types = {
+    '[M+H]+': 1.007276,
+    '[M+NH4]+': 18.034374,
+    '[M+Na]+': 22.989218,
+    '[M+K]+': 38.963158,
+    '[M+CH3OH+H]+': 33.033489,
+    '[M+ACN+H]+': 42.033823,
+    '[M+ACN+Na]+': 64.015765,
+    '[M+2ACN+H]+': 83.06037,
+    '[M-H]-': -1.007276,
+    '[M+Cl]-': 34.969402,
+}
+
+######################################################################
+# Parse MSP file and run metfrag CLI
+######################################################################
+# keep a list of commands if running the CLI calls in parallel
+cmds = []
+# keep a dictionary of all params
+paramds = {}
+# keep count of spectra (for uid)
+spectrac = 0
+
+with open(args.input_pth, "r") as infile:
     numlines = 0
     for line in infile:
         line = line.strip()
+        print(line)
         if numlines == 0:
-            if "NAME" in line:
-                featid = line.split("NAME: ")[1]
-            if "PRECURSORMZ" in line:
-                mz = float(line.split("PRECURSORMZ: ")[1])
-                if args.polarity=="pos":
-                    mz2 = mz-1.007276
-                else:
-                    mz2 = mz+1.007276
-            if "Num Peaks" in line:
-                numlines = int(line.split("Num Peaks: ")[1])
+            # =============== Extract metadata from MSP ========================
+            meta_info = parse_meta(meta_regex, meta_info)
+            print(meta_info)
+            if ('massbank' in meta_info and 'cols' in meta_info) or ('msp' in meta_info and 'num_peaks' in meta_info):
+                print('check')
+                numlines = int(meta_info['num_peaks'])
                 linesread = 0
                 peaklist = []
-        else:
-            if linesread == numlines:
-                numlines = 0
-                #write spec file
-                with open('./tmpspec.txt', 'w') as outfile:
-                    for p in peaklist:
-                        outfile.write(p[0]+"\t"+p[1]+"\n")
-                #create commandline input
-                cmd_command = "PeakListPath=tmpspec.txt "
-                if args.db_local != "None":
-                    cmd_command += "MetFragDatabaseType=LocalCSV "
-                    cmd_command += "LocalDatabasePath={0} ".format(args.db_local)
+
+        elif linesread < numlines:
+            # =============== Extract peaks from MSP ==========================
+            line = tuple(line.split())  # .split() will split on any empty space (i.e. tab and space)
+            # Keep only m/z and intensity, not relative intensity
+            save_line = tuple(line[0].split() + line[1].split())
+            linesread += 1
+            peaklist.append(save_line)
+
+        elif linesread == numlines:
+            # =============== Get sample name and additional details for output =======
+            # use a unique uuid4 to keep track of processing (important for multicore)
+            #rd = str(uuid.uuid4())
+            spectrac += 1
+
+            # Get sample details, if possible to extract (e.g. if created as part of the msPurity pipeline)
+            # choose between getting additional details to add as columns as either all meta data from msp, just
+            # details from the record name (i.e. when using msPurity and we have the columns coded into the name) or
+            # just the spectra index (spectrac)
+            if args.meta_select_col == 'name':
+                # have additional column of just the name
+                paramd['additional_details'] = {'name': meta_info['name']}
+            elif args.meta_select_col == 'name_split':
+                # have additional columns split by "|" and then on ":" e.g. MZ:100.2 | RT:20 | xcms_grp_id:1
+                sampled = {sm.split(":")[0].strip(): sm.split(":")[1].strip() for sm in
+                               meta_info['name'].split("|")}
+            elif args.meta_select_col == 'all':
+                # have additional columns based on all the meta information extracted from the MSP
+                paramd['additional_details'] = meta_info
+            else:
+                # Just have and index of the spectra in the MSP file
+                paramd['additional_details'] = {'spectra_idx': spectrac}
+
+            paramd["SampleName"] = "{}_metfrag_result".format(spectrac)
+
+            # =============== Output peaks to txt file  ==============================
+            numlines = 0
+            paramd["PeakListPath"] = os.path.join(wd, "{}_tmpspec.txt".format(spectrac))
+            print(paramd["PeakListPath"])
+            # write spec file
+            with open(paramd["PeakListPath"], 'w') as outfile:
+                for p in peaklist:
+                    outfile.write(p[0] + "\t" + p[1] + "\n")
+
+            # =============== Update param based on MSP metadata ======================
+            # Replace param details with details from MSP if required
+            if 'precursor_type' in meta_info and meta_info['precursor_type'] in adduct_types:
+
+                nm = float(meta_info['precursor_mz']) + adduct_types[meta_info['precursor_type']]
+                paramd["PrecursorIonMode"] = int(round(adduct_types[meta_info['precursor_type']], 0))
+            else:
+
+                paramd["PrecursorIonMode"] = paramd['PrecursorIonModeDefault']
+                nm = float(meta_info['precursor_mz']) + paramd['nm_mass_diff_default']
+
+            paramd["NeutralPrecursorMass"] = nm
+
+            # =============== Create CLI cmd for metfrag ===============================
+            cmd = "metfrag"
+            for k, v in six.iteritems(paramd):
+                if not k in ['PrecursorIonModeDefault', 'nm_mass_diff_default', 'additional_details']:
+                    cmd += " {}={}".format(str(k), str(v))
+            paramds[paramd["SampleName"]] = paramd
+
+            # =============== Run metfrag ==============================================
+            # Filter before process with a minimum number of MS/MS peaks
+            if linesread >= float(args.minMSMSpeaks):
+
+                if int(args.cores_top_level) > 1:
+                    cmds.append(cmd)
                 else:
-                    cmd_command += "MetFragDatabaseType={0} ".format(args.db_online)
-                cmd_command += "FragmentPeakMatchAbsoluteMassDeviation={0} ".format(args.fragmasstol)
-                cmd_command += "FragmentPeakMatchRelativeMassDeviation={0} ".format(args.ppm_frag)
-                cmd_command += "DatabaseSearchRelativeMassDeviation={0} ".format(args.ppm)
-                cmd_command += "NeutralPrecursorMass={0} ".format(mz2)
-                cmd_command += "SampleName={0}_metfrag ".format(featid)
-                cmd_command += "ResultsPath=./tmet/ "
-                if args.polarity == "pos":
-                    cmd_command += "IsPositiveIonMode=True "
-                else:
-                    cmd_command += "IsPositiveIonMode=False "
-                if args.polarity == "pos": ### Annotation information. Create a dict for the PrecurorIonModes??
-                    cmd_command += "PrecursorIonMode=1 "
-                else:
-                    cmd_command += "PrecursorIonMode=-1 "
-                cmd_command += "MetFragCandidateWriter=CSV " ## TSV not available
-                cmd_command += "NumberThreads={} ".format(args.threads)
-                # run Metfrag
-                print "metfrag {0}".format(cmd_command)
-                os.system("metfrag {0}".format(cmd_command))
-            else:
-                line = tuple(line.split("\t"))
-                linesread += 1
-                peaklist.append(line)
+                    print(cmd)
+                    os.system(cmd)
+
+            meta_info = {}
 
 
-#outputs might have different headers. Need to get a list of all the headers before we start merging the files
-outfiles = sorted(os.listdir("./tmet"))
+def work(cmds):
+    return [os.system(cmd) for cmd in cmds]
+
+
+# Perform multiprocessing on command line call level
+if int(args.cores_top_level) > 1:
+    cmds_chunks = [cmds[x:x + int(args.chunks)] for x in list(range(0, len(cmds), int(args.chunks)))]
+    pool = multiprocessing.Pool(processes=int(args.cores_top_level))
+    pool.map(work, cmds_chunks)
+    pool.close()
+    pool.join()
+
+######################################################################
+# Concatenate and filter the output
+######################################################################
+# outputs might have different headers. Need to get a list of all the headers before we start merging the files
+# outfiles = [os.path.join(wd, f) for f in glob.glob(os.path.join(wd, "*_metfrag_result.csv"))]
+outfiles = glob.glob(os.path.join(wd, "*_metfrag_result.csv"))
 
 headers = []
 c = 0
-for fname in outfiles:
-    with open("./tmet/"+fname) as infile:
+for fn in outfiles:
+    with open(fn, 'r') as infile:
         reader = csv.reader(infile)
-        headers.extend(reader.next())
+        if sys.version_info >= (3, 0):
+            headers.extend(next(reader))
+        else:
+            headers.extend(reader.next())
         # check if file has any data rows 
         for i, row in enumerate(reader):
-            c+=1
-            if i==1:
+            c += 1
+            if i == 1:
                 break
-            
+
 # if no data rows (e.g. matches) then do not save an output and leave the program        
-if c==0:
+if c == 0:
     sys.exit()
 
-
-print headers
-headers = ['UID'] + sorted(list(set(headers)))
-print headers
+additional_detail_headers = ['sample_name']
+for k, paramd in six.iteritems(paramds):
+    additional_detail_headers = list(set(additional_detail_headers + list(paramd['additional_details'].keys())))
 
-#merge outputs
-with open(args.results, 'a') as merged_outfile:
+headers = additional_detail_headers + sorted(list(set(headers)))
 
-    dwriter = csv.DictWriter(merged_outfile, fieldnames=headers, delimiter='\t')
+# merge outputs
+with open(args.result_pth, 'a') as merged_outfile:
+    dwriter = csv.DictWriter(merged_outfile, fieldnames=headers, delimiter='\t', quotechar='"',
+        quoting=csv.QUOTE_NONNUMERIC,)
     dwriter.writeheader()
 
-    for fname in outfiles:
-        fileid = os.path.basename(fname).split("_")[0]
-        with open("./tmet/"+fname) as infile:
+    for fn in outfiles:
+
+        with open(fn) as infile:
             reader = csv.DictReader(infile, delimiter=',', quotechar='"')
             for line in reader:
-                line['UID'] = fileid
-                dwriter.writerow(line)
+                bewrite = True
+                for key, value in line.items():
+                    # Filter when no MS/MS peak matched
+                    if key == "ExplPeaks":
+                        if float(args.pctexplpeak_thrshld) > 0 and "NA" in value:
+                            bewrite = False
+                    # Filter with a score threshold
+                    elif key == "Score":
+                        if float(value) <= float(args.score_thrshld):
+                            bewrite = False
+                    elif key == "NoExplPeaks":
+                        nbfindpeak = float(value)
+                    elif key == "NumberPeaksUsed":
+                        totpeaks = float(value)
+                # Filter with a relative number of peak matched
+                try:
+                    pctexplpeak = nbfindpeak / totpeaks * 100
+                except ZeroDivisionError:
+                    bewrite = False
+                else:
+                    if pctexplpeak < float(args.pctexplpeak_thrshld):
+                        bewrite = False
 
+                # Write the line if it passes all filters
+                if bewrite:
+                    bfn = os.path.basename(fn)
+                    bfn = bfn.replace(".csv", "")
+                    ad = paramds[bfn]['additional_details']
+                    line.update(ad)
+                    line['sample_name'] = paramds[bfn]['SampleName']
 
+                    dwriter.writerow(line)