Mercurial > repos > jjohnson > spectrast
diff link_scan_datasets.py @ 3:7f02fc51bddf draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/spectrast commit 379705f578f9a0465f497894c7d2b5f68b6a55e6-dirty
author | jjohnson |
---|---|
date | Wed, 25 Jul 2018 10:58:17 -0400 |
parents | |
children | 274fdc50169b |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/link_scan_datasets.py Wed Jul 25 10:58:17 2018 -0400 @@ -0,0 +1,110 @@ +#!/usr/bin/env python + +from __future__ import print_function + +import argparse +import difflib +from difflib import SequenceMatcher +import os +import os.path +import sys +import xml.sax + + + +def __main__(): + parser = argparse.ArgumentParser( + description='link spectrum datasets to the name used' + + ' in the identification dataset') + parser.add_argument( + 'ident_files', nargs='+', + help='Pepxml or mzIdentML') + parser.add_argument( + '-n', '--scan_name', default=[], action='append', + help='Name for scan file') + parser.add_argument( + '-f', '--scan_file', default=[], action='append', + help='Path for scan file') + args = parser.parse_args() + + class MzidHandler( xml.sax.ContentHandler): + def __init__(self): + xml.sax.ContentHandler.__init__(self) + self.spectraDataFiles = [] + self.spectraDataNames = [] + self.searchDatabaseFiles = [] + self.searchDatabaseNames = [] + def startElement(self, tag, attrs): + if tag == 'SpectraData': + id = attrs['id'] + path = attrs['location'] + filename = os.path.basename(path) + name = attrs['name'] if 'name' in attrs else None + self.spectraDataFiles.append(filename) + self.spectraDataNames.append(name if name else id) + print ("SpectraData: %s %s" % (name if name else id, path)) + if tag == 'SearchDatabase': + id = attrs['id'] + path = attrs['location'] + filename = os.path.basename(path) + name = attrs['name'] if 'name' in attrs else None + self.searchDatabaseFiles.append(filename) + self.searchDatabaseNames.append(name if name else id) + print ("SearchDatabase: %s %s" % (name if name else id, path)) + def endElement( self, name): + pass + def characters( self, data): + pass + + class PepXmlHandler( xml.sax.ContentHandler): + def __init__(self): + xml.sax.ContentHandler.__init__(self) + self.spectraDataFiles = [] + self.spectraDataNames = [] + def startElement(self, tag, attrs): + if tag == 'msms_run_summary': + basename = attrs['base_name'] + name = os.path.basename(basename) + ext = attrs['raw_data'] + path = '%s%s' % (basename,ext) + filename = os.path.basename(path) + self.spectraDataFiles.append(filename) + self.spectraDataNames.append(name) + print ("SpectraData: %s %s" % (name, path)) + def endElement( self, name): + pass + def characters( self, data): + pass + + parser = xml.sax.make_parser() + parser.setFeature(xml.sax.handler.feature_namespaces, 0) + handler = PepXmlHandler() + parser.setContentHandler( handler ) + for ident in args.ident_files: + parser.parse(ident) + + spectra_names = handler.spectraDataFiles + + def best_match(name): + if name in spectra_names: + return name + try: + r = [SequenceMatcher(None, name, spectra_names[x]).ratio() for x in range(len(spectra_names))] + return spectra_names[r.index(max(r))] + except Exception, e: + print ("best_match: %s %s" % (name, e)) + + for i,name in enumerate(args.scan_name): + path = args.scan_file[i] if len(args.scan_file) > i else '' + (root, ext) = os.path.splitext(name) + print ("SpectraFile: %s %s" % (name, path)) + iname = best_match(name) + print ("IdentName: %s %s" % (name, iname)) + if not os.path.exists(iname) and os.path.exists(path): + os.symlink(path, iname) + print ("%s -> %s" % (iname, path)) + + +if __name__ == "__main__": + __main__() +