Mercurial > repos > iracooke > proteomics_datatypes
changeset 0:7101f7e4b00b
Uploaded
author | iracooke |
---|---|
date | Wed, 08 May 2013 03:25:50 -0400 |
parents | |
children | 4198976f4e5e |
files | README datatypes_conf.xml display_applications/proteomics/PepXml.xml display_applications/proteomics/ProtGff.xml display_applications/proteomics/ProtXml.xml display_applications/proteomics/mzML.xml proteomics.py tool-data/proteogenomics_display_site.txt.sample tool-data/protk_display_site.txt.sample |
diffstat | 9 files changed, 415 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README Wed May 08 03:25:50 2013 -0400 @@ -0,0 +1,9 @@ +## What is it? +Galaxy datatype and display-application definitions for Proteomics data + +## Installation +Install into your local galaxy instance from the main galaxy toolshed at http://toolshed.g2.bx.psu.edu/ + +To visualize data you will need to install the protviz visualization web application. This is available at +[https://bitbucket.org/Andrew_Brock/proteomics-visualise](https://bitbucket.org/Andrew_Brock/proteomics-visualise) +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datatypes_conf.xml Wed May 08 03:25:50 2013 -0400 @@ -0,0 +1,52 @@ +<?xml version="1.0"?> +<datatypes> + <datatype_files> + <datatype_file name="proteomics.py"/> + </datatype_files> + <registration display_path="display_applications"> + <datatype extension="prot_gff" type="galaxy.datatypes.proteomics:ProtGff" mimetype="application/xml" display_in_upload="true"> + <display file="proteomics/ProtGff.xml" /> + </datatype> + <datatype extension="pepxml" type="galaxy.datatypes.proteomics:PepXml" mimetype="application/xml" display_in_upload="true"> + <display file="proteomics/PepXml.xml" /> + </datatype> + <datatype extension="raw_pepxml" type="galaxy.datatypes.proteomics:PepXml" subclass="true"> + <display file="proteomics/PepXml.xml" /> + </datatype> + <datatype extension="peptideprophet_pepxml" type="galaxy.datatypes.proteomics:PepXml" subclass="true"> + <display file="proteomics/PepXml.xml" /> + </datatype> + <datatype extension="interprophet_pepxml" type="galaxy.datatypes.proteomics:PepXml" subclass="true"> + <display file="proteomics/PepXml.xml" /> + </datatype> + <datatype extension="protxml" type="galaxy.datatypes.proteomics:ProtXML" display_in_upload="true" > + <display file="proteomics/ProtXml.xml"/> + </datatype> + <datatype extension="mascotdat" type="galaxy.datatypes.proteomics:MascotDat" display_in_upload="false" /> + <datatype extension="mzml" type="galaxy.datatypes.proteomics:MzML" mimetype="application/xml" display_in_upload="true"> + <display file="proteomics/mzML.xml"/> + </datatype> + <datatype extension="mgf" type="galaxy.datatypes.proteomics:Mgf" display_in_upload="true" /> + <datatype extension="xls" type="galaxy.datatypes.proteomics:Xls" display_in_upload="true" /> + <datatype extension="mzxml" type="galaxy.datatypes.proteomics:MzXML" mimetype="application/xml" display_in_upload="true" /> + <datatype extension="mzq" type="galaxy.datatypes.proteomics:MzQuantML" mimetype="application/xml" display_in_upload="true" /> + <datatype extension="mzid" type="galaxy.datatypes.proteomics:MzIdentML" mimetype="application/xml" display_in_upload="true" /> + <datatype extension="traML" type="galaxy.datatypes.proteomics:TraML" mimetype="application/xml" display_in_upload="true" /> + <datatype extension="raw" type="galaxy.datatypes.proteomics:RAW" display_in_upload="true" /> + <datatype extension="msp" type="galaxy.datatypes.proteomics:Msp" display_in_upload="true" /> + <datatype extension="ms2" type="galaxy.datatypes.proteomics:Ms2" display_in_upload="true" /> + <datatype extension="hlf" type="galaxy.datatypes.proteomics:XHunterAslFormat" display_in_upload="true" /> + </registration> + <sniffers> + <sniffer type="galaxy.datatypes.proteomics:ProtGff"/> + <sniffer type="galaxy.datatypes.proteomics:MzML"/> + <sniffer type="galaxy.datatypes.proteomics:PepXml"/> + <sniffer type="galaxy.datatypes.proteomics:Mgf"/> + <sniffer type="galaxy.datatypes.proteomics:ProtXML"/> + <sniffer type="galaxy.datatypes.proteomics:MzXML"/> + <sniffer type="galaxy.datatypes.proteomics:TraML"/> + <sniffer type="galaxy.datatypes.proteomics:MzIdentML"/> + <sniffer type="galaxy.datatypes.proteomics:MzQuantML"/> + <sniffer type="galaxy.datatypes.proteomics:Xls"/> + </sniffers> +</datatypes>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/display_applications/proteomics/PepXml.xml Wed May 08 03:25:50 2013 -0400 @@ -0,0 +1,18 @@ +<display id="proteomics_pepxml" version="1.0.0" name="view pepXML in"> + <dynamic_links from_file="tool-data/protk_display_site.txt" skip_startswith="#" id="0" name="0"> + <!-- Define parameters by column from file --> + <dynamic_param name="site_id" value="0"/> + <dynamic_param name="site_url" value="1"/> + <!-- We define url and params as normal, but values defined in dynamic_param are available by specified name --> + <url target_frame="galaxy_main">${site_url}/init_local?file=${encoded_filename.qp}&type=pepxml</url> + <param type="data" name="pep_file" viewable="False" format="pepXML"/> + <param type="data" dataset="pep_file" name="pepxml_file" format="pepXML" viewable="False" /> + <param type="template" name="encoded_filename" strip="True" > + #import binascii + ${binascii.hexlify( $pepxml_file.file_name )} + </param> + <param type="template" name="galaxy_url" strip="True" > + ${BASE_URL.split(":")[1][2:]} + </param> + </dynamic_links> +</display>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/display_applications/proteomics/ProtGff.xml Wed May 08 03:25:50 2013 -0400 @@ -0,0 +1,18 @@ +<display id="proteomics_gff" version="1.0.0" name="view gff in"> + <dynamic_links from_file="tool-data/proteogenomics_display_site.txt" skip_startswith="#" id="0" name="0"> + <!-- Define parameters by column from file --> + <dynamic_param name="site_id" value="0"/> + <dynamic_param name="site_url" value="1"/> + <!-- We define url and params as normal, but values defined in dynamic_param are available by specified name --> + <url target_frame="galaxy_main">${site_url}/init_local?file=${encoded_filename.qp}&type=protgff</url> + <param type="data" name="prot_file" viewable="False" format="protgff"/> + <param type="data" dataset="prot_file" name="protgff_file" format="protgff" viewable="False" /> + <param type="template" name="encoded_filename" strip="True" > + #import binascii + ${binascii.hexlify( $protgff_file.file_name )} + </param> + <param type="template" name="galaxy_url" strip="True" > + ${BASE_URL.split(":")[1][2:]} + </param> + </dynamic_links> +</display> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/display_applications/proteomics/ProtXml.xml Wed May 08 03:25:50 2013 -0400 @@ -0,0 +1,18 @@ +<display id="proteomics_protxml" version="1.0.0" name="view protXML in"> + <dynamic_links from_file="tool-data/protk_display_site.txt" skip_startswith="#" id="0" name="0"> + <!-- Define parameters by column from file --> + <dynamic_param name="site_id" value="0"/> + <dynamic_param name="site_url" value="1"/> + <!-- We define url and params as normal, but values defined in dynamic_param are available by specified name --> + <url target_frame="galaxy_main">${site_url}/init_local?file=${encoded_filename.qp}&type=protxml</url> + <param type="data" name="prot_file" viewable="False" format="protXML"/> + <param type="data" dataset="prot_file" name="protxml_file" format="protXML" viewable="False" /> + <param type="template" name="encoded_filename" strip="True" > + #import binascii + ${binascii.hexlify( $protxml_file.file_name )} + </param> + <param type="template" name="galaxy_url" strip="True" > + ${BASE_URL.split(":")[1][2:]} + </param> + </dynamic_links> +</display> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/display_applications/proteomics/mzML.xml Wed May 08 03:25:50 2013 -0400 @@ -0,0 +1,18 @@ +<display id="proteomics_mzml" version="1.0.2" name="view mzML data"> + <dynamic_links from_file="tool-data/protk_display_site.txt" skip_startswith="#" id="0" name="0"> + <!-- Define parameters by column from file --> + <dynamic_param name="site_id" value="0"/> + <dynamic_param name="site_url" value="1"/> + <!-- We define url and params as normal, but values defined in dynamic_param are available by specified name --> + <url target_frame="galaxy_main">${site_url}/init_local?file=${encoded_filename.qp}&type=mzml</url> + <param type="data" name="raw_file" viewable="False" format="mzML"/> + <param type="data" dataset="raw_file" name="mzml_file" format="mzML" viewable="False" /> + <param type="template" name="encoded_filename" strip="True" > + #import binascii + ${binascii.hexlify( $mzml_file.file_name )} + </param> + <param type="template" name="galaxy_url" strip="True" > + ${BASE_URL.split(":")[1][2:]} + </param> + </dynamic_links> +</display> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/proteomics.py Wed May 08 03:25:50 2013 -0400 @@ -0,0 +1,276 @@ +""" +Proteomics format classes +""" +import logging +import re +from galaxy.datatypes.data import * +from galaxy.datatypes.xml import * +from galaxy.datatypes.sniff import * +from galaxy.datatypes.binary import * +from galaxy.datatypes.interval import * + +log = logging.getLogger(__name__) + +class ProtGff( Gff ): + """Tab delimited data in Gff format""" + file_ext = "prot_gff" + def set_peek( self, dataset, is_multi_byte=False ): + """Set the peek and blurb text""" + if not dataset.dataset.purged: + dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte ) + dataset.blurb = 'Proteogenomics GFF' + else: + dataset.peek = 'file does not exist' + dataset.blurb = 'file purged from disk' + + def sniff( self, filename ): + handle = open(filename) + xmlns_re = re.compile("^##gff-version") + for i in range(3): + line = handle.readline() + if xmlns_re.match(line.strip()): + handle.close() + return True + + handle.close() + return False + + +class Xls( Binary ): + """Class describing a binary excel spreadsheet file""" + file_ext = "xls" + + def set_peek( self, dataset, is_multi_byte=False ): + if not dataset.dataset.purged: + dataset.peek = "Excel Spreadsheet file" + dataset.blurb = data.nice_size( dataset.get_size() ) + else: + dataset.peek = 'file does not exist' + dataset.blurb = 'file purged from disk' + def display_peek( self, dataset ): + try: + return dataset.peek + except: + return "Binary xls file (%s)" % ( data.nice_size( dataset.get_size() ) ) + +class ProteomicsXml(GenericXml): + """ An enhanced XML datatype used to reuse code across several + proteomic/mass-spec datatypes. """ + + def sniff(self, filename): + """ Determines whether the file is the correct XML type. """ + with open(filename, 'r') as contents: + while True: + line = contents.readline() + if line == None or not line.startswith('<?'): + break + pattern = '^<(\w*:)?%s' % self.root # pattern match <root or <ns:root for any ns string + return line != None and re.match(pattern, line) != None + + def set_peek( self, dataset, is_multi_byte=False ): + """Set the peek and blurb text""" + if not dataset.dataset.purged: + dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte ) + dataset.blurb = self.blurb + else: + dataset.peek = 'file does not exist' + dataset.blurb = 'file purged from disk' + +class PepXml(ProteomicsXml): + """pepXML data""" + file_ext = "pepxml" + blurb = 'pepXML data' + root = "msms_pipeline_analysis" + + +class MzML(ProteomicsXml): + """mzML data""" + file_ext = "mzml" + blurb = 'mzML Mass Spectrometry data' + root = "(mzML|indexedmzML)" + + +class ProtXML(ProteomicsXml): + """protXML data""" + file_ext = "protxml" + blurb = 'prot XML Search Results' + root = "protein_summary" + + +class MzXML(ProteomicsXml): + """mzXML data""" + file_ext = "mzXML" + blurb = "mzXML Mass Spectrometry data" + root = "mzXML" + +## PSI datatypes +class MzIdentML(ProteomicsXml): + file_ext = "mzid" + blurb = "XML identified peptides and proteins." + root = "MzIdentML" + + +class TraML(ProteomicsXml): + file_ext = "traML" + blurb = "TraML transition list" + root = "TraML" + + +class MzQuantML(ProteomicsXml): + file_ext = "mzq" + blurb = "XML quantification data" + root = "MzQuantML" + + +class Mgf( Text ): + """Mascot Generic Format data""" + file_ext = "mgf" + + def set_peek( self, dataset, is_multi_byte=False ): + """Set the peek and blurb text""" + if not dataset.dataset.purged: + dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte ) + dataset.blurb = 'mgf Mascot Generic Format' + else: + dataset.peek = 'file does not exist' + dataset.blurb = 'file purged from disk' + + + def sniff( self, filename ): + mgf_begin_ions = "BEGIN IONS" + max_lines=100 + + for i, line in enumerate( file( filename ) ): + line = line.rstrip( '\n\r' ) + if line==mgf_begin_ions: + return True + if i>max_lines: + return False + + +class MascotDat( Text ): + """Mascot search results """ + file_ext = "mascotdat" + + def set_peek( self, dataset, is_multi_byte=False ): + """Set the peek and blurb text""" + if not dataset.dataset.purged: + dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte ) + dataset.blurb = 'mascotdat Mascot Search Results' + else: + dataset.peek = 'file does not exist' + dataset.blurb = 'file purged from disk' + + + def sniff( self, filename ): + mime_version = "MIME-Version: 1.0 (Generated by Mascot version 1.0)" + max_lines=10 + + for i, line in enumerate( file( filename ) ): + line = line.rstrip( '\n\r' ) + if line==mime_version: + return True + if i>max_lines: + return False + + +class RAW( Binary ): + """Class describing a Thermo Finnigan binary RAW file""" + file_ext = "raw" + def sniff( self, filename ): + # Thermo Finnigan RAW format is proprietary and hence not well documented. + # Files start with 2 bytes that seem to differ followed by F\0i\0n\0n\0i\0g\0a\0n + # This combination represents 17 bytes, but to play safe we read 20 bytes from + # the start of the file. + try: + header = open( filename ).read(20) + hexheader = binascii.b2a_hex( header ) + finnigan = binascii.hexlify( 'F\0i\0n\0n\0i\0g\0a\0n' ) + if hexheader.find(finnigan) != -1: + return True + return False + except: + return False + def set_peek( self, dataset, is_multi_byte=False ): + if not dataset.dataset.purged: + dataset.peek = "Thermo Finnigan RAW file" + dataset.blurb = data.nice_size( dataset.get_size() ) + else: + dataset.peek = 'file does not exist' + dataset.blurb = 'file purged from disk' + def display_peek( self, dataset ): + try: + return dataset.peek + except: + return "Thermo Finnigan RAW file (%s)" % ( data.nice_size( dataset.get_size() ) ) + + +if hasattr(Binary, 'register_sniffable_binary_format'): + Binary.register_sniffable_binary_format('RAW', 'RAW', RAW) + + +class Msp(Text): + """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """ + file_ext = "msp" + + @staticmethod + def next_line_starts_with(contents, prefix): + next_line = contents.readline() + return next_line != None and next_line.startswith(prefix) + + def sniff(self, filename): + """ Determines whether the file is a NIST MSP output file. + + >>> fname = get_test_fname('test.msp') + >>> Msp().sniff(fname) + True + >>> fname = get_test_fname('test.mzXML') + >>> Msp().sniff(fname) + False + """ + with open(filename, 'r') as contents: + return Msp.next_line_starts_with(contents, "Name:") and Msp.next_line_starts_with(contents, "MW:") + +class Ms2(Text): + file_ext = "ms2" + + def sniff(self, filename): + """ Determines whether the file is a valid ms2 file. + + >>> fname = get_test_fname('test.msp') + >>> Ms2().sniff(fname) + False + >>> fname = get_test_fname('test.ms2') + >>> Ms2().sniff(fname) + True + """ + + with open(filename, 'r') as contents: + header_lines = [] + while True: + line = contents.readline() + if line == None or len(line) == 0: + pass + elif line.startswith('H\t'): + header_lines.append(line) + else: + break + for header_field in ['CreationDate', 'Extractor', 'ExtractorVersion', 'ExtractorOptions']: + found_header = False + for header_line in header_lines: + if header_line.startswith('H\t%s' % (header_field)): + found_header = True + break + if not found_header: + return False + + return True + +# unsniffable binary format, should do something about this +class XHunterAslFormat(Binary): + """ Annotated Spectra in the HLF format http://www.thegpm.org/HUNTER/format_2006_09_15.html """ + file_ext = "hlf" + + +if hasattr(Binary, 'register_unsniffable_binary_ext'): + Binary.register_unsniffable_binary_ext('hlf')
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/proteogenomics_display_site.txt.sample Wed May 08 03:25:50 2013 -0400 @@ -0,0 +1,3 @@ +#Proteomic Visualization application should be hosted on the same server as galaxy +#Entries in this file are of the format "site_id" site_url +Proteogenomics Browser http://127.0.0.1:8600
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/protk_display_site.txt.sample Wed May 08 03:25:50 2013 -0400 @@ -0,0 +1,3 @@ +#Proteomic Visualization application should be hosted on the same server as galaxy +#Entries in this file are of the format "site_id" site_url +Proteomics Visualize http://127.0.0.1:8500