view link_scan_datasets.py @ 5:274fdc50169b draft default tip

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/spectrast commit e5b5b15b0a995c8588ff62f92fd0a2329fb7a6a5-dirty
author jjohnson
date Wed, 25 Jul 2018 15:05:34 -0400
parents 7f02fc51bddf
children
line wrap: on
line source

#!/usr/bin/env python

from __future__ import print_function

import argparse
import os
import os.path
import xml.sax
from difflib import SequenceMatcher


class _IdentFileHandler(xml.sax.ContentHandler):
    """Collect spectrum and search-database names from identification files.

    Recognises the pepXML ``msms_run_summary`` element as well as the
    mzIdentML ``SpectraData`` and ``SearchDatabase`` elements, so a single
    handler serves both formats accepted on the command line.

    Attributes:
        spectraDataFiles: basenames of the referenced spectrum files.
        spectraDataNames: display name recorded for each spectrum file.
        searchDatabaseFiles: basenames of the referenced search databases.
        searchDatabaseNames: display name recorded for each search database.
    """

    def __init__(self):
        # Explicit base call (not super()) keeps this working on Python 2,
        # where ContentHandler is an old-style class.
        xml.sax.ContentHandler.__init__(self)
        self.spectraDataFiles = []
        self.spectraDataNames = []
        self.searchDatabaseFiles = []
        self.searchDatabaseNames = []

    def startElement(self, tag, attrs):
        """Record the file referenced by *tag* when it is a known element."""
        if tag == 'msms_run_summary':
            # pepXML: the file name is base_name + raw_data (the extension).
            base_name = attrs['base_name']
            path = '%s%s' % (base_name, attrs.get('raw_data', ''))
            self.spectraDataFiles.append(os.path.basename(path))
            self.spectraDataNames.append(os.path.basename(base_name))
            print("SpectraData: %s  %s" % (os.path.basename(base_name), path))
        elif tag == 'SpectraData':
            self._record_mzid(attrs, 'SpectraData',
                              self.spectraDataFiles, self.spectraDataNames)
        elif tag == 'SearchDatabase':
            self._record_mzid(attrs, 'SearchDatabase',
                              self.searchDatabaseFiles,
                              self.searchDatabaseNames)

    @staticmethod
    def _record_mzid(attrs, label, files, names):
        """Append the basename of ``location`` and the name (or id) of *attrs*."""
        path = attrs['location']
        # Fall back to the element id when no (non-empty) name is given.
        display_name = attrs.get('name') or attrs['id']
        files.append(os.path.basename(path))
        names.append(display_name)
        print("%s: %s  %s" % (label, display_name, path))


def _best_match(name, candidates):
    """Return the entry of *candidates* most similar to *name*.

    An exact match wins outright; otherwise the candidate with the highest
    ``SequenceMatcher`` ratio is returned (the first one on ties).  Returns
    ``None`` when *candidates* is empty.
    """
    if not candidates:
        return None
    if name in candidates:
        return name
    return max(candidates,
               key=lambda cand: SequenceMatcher(None, name, cand).ratio())


def __main__(argv=None):
    """Symlink scan files to the names used in the identification files.

    Every identification file is scanned for the spectrum file names it
    references.  Each ``--scan_name`` is then paired with the ``--scan_file``
    given at the same position, matched to the closest referenced name, and a
    symlink with that name is created in the current directory pointing at
    the scan file.  Nothing is linked when the target name already exists or
    the scan file path does not exist.

    Args:
        argv: optional list of command-line arguments; ``None`` (the default)
            makes argparse read ``sys.argv[1:]``.
    """
    arg_parser = argparse.ArgumentParser(
        description='link spectrum datasets to the name used'
                    ' in the identification dataset')
    arg_parser.add_argument(
        'ident_files', nargs='+',
        help='Pepxml or mzIdentML')
    arg_parser.add_argument(
        '-n', '--scan_name', default=[], action='append',
        help='Name for scan file')
    arg_parser.add_argument(
        '-f', '--scan_file', default=[], action='append',
        help='Path for scan file')
    args = arg_parser.parse_args(argv)

    handler = _IdentFileHandler()
    sax_parser = xml.sax.make_parser()
    # Namespace processing off: elements are reported by their plain tag name.
    sax_parser.setFeature(xml.sax.handler.feature_namespaces, 0)
    sax_parser.setContentHandler(handler)
    for ident in args.ident_files:
        sax_parser.parse(ident)

    spectra_names = handler.spectraDataFiles

    for i, name in enumerate(args.scan_name):
        # A scan name without a corresponding --scan_file gets an empty path,
        # which never exists and is therefore never linked.
        path = args.scan_file[i] if len(args.scan_file) > i else ''
        print("SpectraFile: %s  %s" % (name, path))
        iname = _best_match(name, spectra_names)
        if iname is None:
            print("best_match: %s  %s" % (
                name, 'no spectra names found in the identification files'))
        print("IdentName: %s  %s" % (name, iname))
        if iname is None:
            continue
        if not os.path.exists(iname) and os.path.exists(path):
            os.symlink(path, iname)
            print("%s -> %s" % (iname, path))


# Script entry point: run the linker only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    __main__()