Mercurial > repos > iracooke > proteomics_datatypes

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README	Wed May 08 03:25:50 2013 -0400
@@ -0,0 +1,9 @@
+## What is it?
+Galaxy datatype and display-application definitions for Proteomics data
+
+## Installation
+Install into your local galaxy instance from the main galaxy toolshed at http://toolshed.g2.bx.psu.edu/
+
+To visualize data you will need to install the protviz visualization web application.  This is available at
+[https://bitbucket.org/Andrew_Brock/proteomics-visualise](https://bitbucket.org/Andrew_Brock/proteomics-visualise)
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes_conf.xml	Wed May 08 03:25:50 2013 -0400
@@ -0,0 +1,52 @@
+<?xml version="1.0"?>
+<datatypes>
+  <datatype_files>
+    <datatype_file name="proteomics.py"/>
+  </datatype_files>
+  <registration display_path="display_applications">
+    <datatype extension="prot_gff" type="galaxy.datatypes.proteomics:ProtGff" mimetype="application/xml" display_in_upload="true">
+       <display file="proteomics/ProtGff.xml" />
+    </datatype>
+    <datatype extension="pepxml" type="galaxy.datatypes.proteomics:PepXml" mimetype="application/xml" display_in_upload="true">
+      <display file="proteomics/PepXml.xml" />
+    </datatype>
+    <datatype extension="raw_pepxml" type="galaxy.datatypes.proteomics:PepXml" subclass="true">
+      <display file="proteomics/PepXml.xml" />
+    </datatype>
+    <datatype extension="peptideprophet_pepxml" type="galaxy.datatypes.proteomics:PepXml" subclass="true">
+      <display file="proteomics/PepXml.xml" />
+    </datatype>
+    <datatype extension="interprophet_pepxml" type="galaxy.datatypes.proteomics:PepXml" subclass="true">
+      <display file="proteomics/PepXml.xml" />
+    </datatype>
+    <datatype extension="protxml" type="galaxy.datatypes.proteomics:ProtXML" display_in_upload="true" >
+      <display file="proteomics/ProtXml.xml"/>
+    </datatype>
+    <datatype extension="mascotdat" type="galaxy.datatypes.proteomics:MascotDat" display_in_upload="false" />
+    <datatype extension="mzml" type="galaxy.datatypes.proteomics:MzML" mimetype="application/xml" display_in_upload="true">
+      <display file="proteomics/mzML.xml"/>
+    </datatype>
+    <datatype extension="mgf" type="galaxy.datatypes.proteomics:Mgf" display_in_upload="true" />
+    <datatype extension="xls" type="galaxy.datatypes.proteomics:Xls" display_in_upload="true" />
+    <datatype extension="mzxml" type="galaxy.datatypes.proteomics:MzXML" mimetype="application/xml" display_in_upload="true" />
+    <datatype extension="mzq" type="galaxy.datatypes.proteomics:MzQuantML" mimetype="application/xml" display_in_upload="true" />
+    <datatype extension="mzid" type="galaxy.datatypes.proteomics:MzIdentML" mimetype="application/xml" display_in_upload="true" />
+    <datatype extension="traML" type="galaxy.datatypes.proteomics:TraML" mimetype="application/xml" display_in_upload="true" />
+    <datatype extension="raw" type="galaxy.datatypes.proteomics:RAW" display_in_upload="true" />
+    <datatype extension="msp" type="galaxy.datatypes.proteomics:Msp" display_in_upload="true" />
+    <datatype extension="ms2" type="galaxy.datatypes.proteomics:Ms2" display_in_upload="true" />
+    <datatype extension="hlf" type="galaxy.datatypes.proteomics:XHunterAslFormat" display_in_upload="true" />
+  </registration>
+  <sniffers>
+    <sniffer type="galaxy.datatypes.proteomics:ProtGff"/>
+    <sniffer type="galaxy.datatypes.proteomics:MzML"/>
+    <sniffer type="galaxy.datatypes.proteomics:PepXml"/>
+    <sniffer type="galaxy.datatypes.proteomics:Mgf"/>
+    <sniffer type="galaxy.datatypes.proteomics:ProtXML"/>
+    <sniffer type="galaxy.datatypes.proteomics:MzXML"/>
+    <sniffer type="galaxy.datatypes.proteomics:TraML"/>
+    <sniffer type="galaxy.datatypes.proteomics:MzIdentML"/>
+    <sniffer type="galaxy.datatypes.proteomics:MzQuantML"/>
+    <sniffer type="galaxy.datatypes.proteomics:Xls"/>
+  </sniffers>
+</datatypes>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/display_applications/proteomics/PepXml.xml	Wed May 08 03:25:50 2013 -0400
@@ -0,0 +1,18 @@
+<display id="proteomics_pepxml" version="1.0.0" name="view pepXML in">
+	<dynamic_links from_file="tool-data/protk_display_site.txt" skip_startswith="#" id="0" name="0">
+        <!-- Define parameters by column from file -->
+        <dynamic_param name="site_id" value="0"/>
+        <dynamic_param name="site_url" value="1"/>
+        <!-- We define url and params as normal, but values defined in dynamic_param are available by specified name -->
+        <url target_frame="galaxy_main">${site_url}/init_local?file=${encoded_filename.qp}&amp;type=pepxml</url>
+        <param type="data" name="pep_file" viewable="False" format="pepXML"/>
+        <param type="data" dataset="pep_file" name="pepxml_file" format="pepXML" viewable="False" />
+        <param type="template" name="encoded_filename" strip="True" >
+            #import binascii
+            ${binascii.hexlify( $pepxml_file.file_name )}
+        </param>
+        <param type="template" name="galaxy_url" strip="True" >
+                ${BASE_URL.split(":")[1][2:]}
+        </param>
+    </dynamic_links>
+</display>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/display_applications/proteomics/ProtGff.xml	Wed May 08 03:25:50 2013 -0400
@@ -0,0 +1,18 @@
+<display id="proteomics_gff" version="1.0.0" name="view gff in">
+        <dynamic_links from_file="tool-data/proteogenomics_display_site.txt" skip_startswith="#" id="0" name="0">
+        <!-- Define parameters by column from file -->
+        <dynamic_param name="site_id" value="0"/>
+        <dynamic_param name="site_url" value="1"/>
+        <!-- We define url and params as normal, but values defined in dynamic_param are available by specified name -->
+        <url target_frame="galaxy_main">${site_url}/init_local?file=${encoded_filename.qp}&amp;type=protgff</url>
+        <param type="data" name="prot_file" viewable="False" format="protgff"/>
+        <param type="data" dataset="prot_file" name="protgff_file" format="protgff" viewable="False" />
+        <param type="template" name="encoded_filename" strip="True" >
+            #import binascii
+            ${binascii.hexlify( $protgff_file.file_name )}
+        </param>
+        <param type="template" name="galaxy_url" strip="True" >
+                ${BASE_URL.split(":")[1][2:]}
+        </param>
+    </dynamic_links>
+</display>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/display_applications/proteomics/ProtXml.xml	Wed May 08 03:25:50 2013 -0400
@@ -0,0 +1,18 @@
+<display id="proteomics_protxml" version="1.0.0" name="view protXML in">
+	<dynamic_links from_file="tool-data/protk_display_site.txt" skip_startswith="#" id="0" name="0">
+        <!-- Define parameters by column from file -->
+        <dynamic_param name="site_id" value="0"/>
+        <dynamic_param name="site_url" value="1"/>
+        <!-- We define url and params as normal, but values defined in dynamic_param are available by specified name -->
+        <url target_frame="galaxy_main">${site_url}/init_local?file=${encoded_filename.qp}&amp;type=protxml</url>
+        <param type="data" name="prot_file" viewable="False" format="protXML"/>
+        <param type="data" dataset="prot_file" name="protxml_file" format="protXML" viewable="False" />
+        <param type="template" name="encoded_filename" strip="True" >
+            #import binascii
+            ${binascii.hexlify( $protxml_file.file_name )}
+        </param>
+        <param type="template" name="galaxy_url" strip="True" >
+                ${BASE_URL.split(":")[1][2:]}
+        </param>
+    </dynamic_links>
+</display>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/display_applications/proteomics/mzML.xml	Wed May 08 03:25:50 2013 -0400
@@ -0,0 +1,18 @@
+<display id="proteomics_mzml" version="1.0.2" name="view mzML data">
+	<dynamic_links from_file="tool-data/protk_display_site.txt" skip_startswith="#" id="0" name="0">
+        <!-- Define parameters by column from file -->
+        <dynamic_param name="site_id" value="0"/>
+        <dynamic_param name="site_url" value="1"/>
+        <!-- We define url and params as normal, but values defined in dynamic_param are available by specified name -->
+        <url target_frame="galaxy_main">${site_url}/init_local?file=${encoded_filename.qp}&amp;type=mzml</url>
+        <param type="data" name="raw_file" viewable="False" format="mzML"/>
+        <param type="data" dataset="raw_file" name="mzml_file" format="mzML" viewable="False" />
+        <param type="template" name="encoded_filename" strip="True" >
+            #import binascii
+            ${binascii.hexlify( $mzml_file.file_name )}
+        </param>
+        <param type="template" name="galaxy_url" strip="True" >
+                ${BASE_URL.split(":")[1][2:]}
+        </param>
+    </dynamic_links>
+</display>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/proteomics.py	Wed May 08 03:25:50 2013 -0400
@@ -0,0 +1,276 @@
+"""
+Proteomics format classes
+"""
+import logging
+import re
+from galaxy.datatypes.data import *
+from galaxy.datatypes.xml import *
+from galaxy.datatypes.sniff import *
+from galaxy.datatypes.binary import *
+from galaxy.datatypes.interval import *
+
+log = logging.getLogger(__name__)
+
+class ProtGff( Gff ):
+    """Tab delimited data in Gff format"""
+    file_ext = "prot_gff"
+    def set_peek( self, dataset, is_multi_byte=False ):
+        """Set the peek and blurb text"""
+        if not dataset.dataset.purged:
+            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
+            dataset.blurb = 'Proteogenomics GFF'
+        else:
+            dataset.peek = 'file does not exist'
+            dataset.blurb = 'file purged from disk'
+
+    def sniff( self, filename ):
+        handle = open(filename)
+        xmlns_re = re.compile("^##gff-version")
+        for i in range(3):
+            line = handle.readline()
+            if xmlns_re.match(line.strip()):
+                handle.close()
+                return True
+
+        handle.close()
+        return False
+
+
+class Xls( Binary ):
+    """Class describing a binary excel spreadsheet file"""
+    file_ext = "xls"
+
+    def set_peek( self, dataset, is_multi_byte=False ):
+        if not dataset.dataset.purged:
+            dataset.peek  = "Excel Spreadsheet file"
+            dataset.blurb = data.nice_size( dataset.get_size() )
+        else:
+            dataset.peek = 'file does not exist'
+            dataset.blurb = 'file purged from disk'
+    def display_peek( self, dataset ):
+        try:
+            return dataset.peek
+        except:
+            return "Binary xls file (%s)" % ( data.nice_size( dataset.get_size() ) )
+
+class ProteomicsXml(GenericXml):
+    """ An enhanced XML datatype used to reuse code across several
+    proteomic/mass-spec datatypes. """
+
+    def sniff(self, filename):
+        """ Determines whether the file is the correct XML type. """
+        with open(filename, 'r') as contents:
+            while True:
+                line = contents.readline()
+                if line == None or not line.startswith('<?'):
+                    break
+            pattern = '^<(\w*:)?%s' % self.root # pattern match <root or <ns:root for any ns string
+            return line != None and re.match(pattern, line) != None
+
+    def set_peek( self, dataset, is_multi_byte=False ):
+        """Set the peek and blurb text"""
+        if not dataset.dataset.purged:
+            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
+            dataset.blurb = self.blurb
+        else:
+            dataset.peek = 'file does not exist'
+            dataset.blurb = 'file purged from disk'
+
+class PepXml(ProteomicsXml):
+    """pepXML data"""
+    file_ext = "pepxml"
+    blurb = 'pepXML data'
+    root = "msms_pipeline_analysis"
+
+
+class MzML(ProteomicsXml):
+    """mzML data"""
+    file_ext = "mzml"
+    blurb = 'mzML Mass Spectrometry data'
+    root = "(mzML|indexedmzML)"
+
+
+class ProtXML(ProteomicsXml):
+    """protXML data"""
+    file_ext = "protxml"
+    blurb = 'prot XML Search Results'
+    root = "protein_summary"
+
+
+class MzXML(ProteomicsXml):
+    """mzXML data"""
+    file_ext = "mzXML"
+    blurb = "mzXML Mass Spectrometry data"
+    root = "mzXML"
+
+## PSI datatypes
+class MzIdentML(ProteomicsXml):
+    file_ext = "mzid"
+    blurb = "XML identified peptides and proteins."
+    root = "MzIdentML"
+
+
+class TraML(ProteomicsXml):
+    file_ext = "traML"
+    blurb = "TraML transition list"
+    root = "TraML"
+
+
+class MzQuantML(ProteomicsXml):
+    file_ext = "mzq"
+    blurb = "XML quantification data"
+    root = "MzQuantML"
+
+
+class Mgf( Text ):
+    """Mascot Generic Format data"""
+    file_ext = "mgf"
+
+    def set_peek( self, dataset, is_multi_byte=False ):
+        """Set the peek and blurb text"""
+        if not dataset.dataset.purged:
+            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
+            dataset.blurb = 'mgf Mascot Generic Format'
+        else:
+            dataset.peek = 'file does not exist'
+            dataset.blurb = 'file purged from disk'
+
+
+    def sniff( self, filename ):
+        mgf_begin_ions = "BEGIN IONS"
+        max_lines=100
+
+        for i, line in enumerate( file( filename ) ):
+            line = line.rstrip( '\n\r' )
+            if line==mgf_begin_ions:
+                return True
+            if i>max_lines:
+                return False
+
+
+class MascotDat( Text ):
+    """Mascot search results """
+    file_ext = "mascotdat"
+
+    def set_peek( self, dataset, is_multi_byte=False ):
+        """Set the peek and blurb text"""
+        if not dataset.dataset.purged:
+            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
+            dataset.blurb = 'mascotdat Mascot Search Results'
+        else:
+            dataset.peek = 'file does not exist'
+            dataset.blurb = 'file purged from disk'
+
+
+    def sniff( self, filename ):
+        mime_version = "MIME-Version: 1.0 (Generated by Mascot version 1.0)"
+        max_lines=10
+
+        for i, line in enumerate( file( filename ) ):
+            line = line.rstrip( '\n\r' )
+            if line==mime_version:
+                return True
+            if i>max_lines:
+                return False
+
+
+class RAW( Binary ):
+    """Class describing a Thermo Finnigan binary RAW file"""
+    file_ext = "raw"
+    def sniff( self, filename ):
+        # Thermo Finnigan RAW format is proprietary and hence not well documented.
+        # Files start with 2 bytes that seem to differ followed by F\0i\0n\0n\0i\0g\0a\0n
+        # This combination represents 17 bytes, but to play safe we read 20 bytes from
+        # the start of the file.
+        try:
+            header = open( filename ).read(20)
+            hexheader = binascii.b2a_hex( header )
+            finnigan  = binascii.hexlify( 'F\0i\0n\0n\0i\0g\0a\0n' )
+            if hexheader.find(finnigan) != -1:
+                return True
+            return False
+        except:
+            return False
+    def set_peek( self, dataset, is_multi_byte=False ):
+        if not dataset.dataset.purged:
+            dataset.peek  = "Thermo Finnigan RAW file"
+            dataset.blurb = data.nice_size( dataset.get_size() )
+        else:
+            dataset.peek = 'file does not exist'
+            dataset.blurb = 'file purged from disk'
+    def display_peek( self, dataset ):
+        try:
+            return dataset.peek
+        except:
+            return "Thermo Finnigan RAW file (%s)" % ( data.nice_size( dataset.get_size() ) )
+
+
+if hasattr(Binary, 'register_sniffable_binary_format'):
+    Binary.register_sniffable_binary_format('RAW', 'RAW', RAW)
+
+
+class Msp(Text):
+    """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """
+    file_ext = "msp"
+
+    @staticmethod
+    def next_line_starts_with(contents, prefix):
+        next_line = contents.readline()
+        return next_line != None and next_line.startswith(prefix)
+
+    def sniff(self, filename):
+        """ Determines whether the file is a NIST MSP output file.
+
+        >>> fname = get_test_fname('test.msp')
+        >>> Msp().sniff(fname)
+        True
+        >>> fname = get_test_fname('test.mzXML')
+        >>> Msp().sniff(fname)
+        False
+        """
+        with open(filename, 'r') as contents:
+            return Msp.next_line_starts_with(contents, "Name:") and Msp.next_line_starts_with(contents, "MW:")
+
+class Ms2(Text):
+    file_ext = "ms2"
+
+    def sniff(self, filename):
+        """ Determines whether the file is a valid ms2 file.
+
+        >>> fname = get_test_fname('test.msp')
+        >>> Ms2().sniff(fname)
+        False
+        >>> fname = get_test_fname('test.ms2')
+        >>> Ms2().sniff(fname)
+        True
+        """
+
+        with open(filename, 'r') as contents:
+            header_lines = []
+            while True:
+                line = contents.readline()
+                if line == None or len(line) == 0:
+                    pass
+                elif line.startswith('H\t'):
+                    header_lines.append(line)
+                else:
+                    break
+        for header_field in ['CreationDate', 'Extractor', 'ExtractorVersion', 'ExtractorOptions']:
+            found_header = False
+            for header_line in header_lines:
+                if header_line.startswith('H\t%s' % (header_field)):
+                    found_header = True
+                    break
+            if not found_header:
+                return False
+
+        return True
+
+# unsniffable binary format, should do something about this
+class XHunterAslFormat(Binary):
+    """ Annotated Spectra in the HLF format http://www.thegpm.org/HUNTER/format_2006_09_15.html """
+    file_ext = "hlf"
+
+
+if hasattr(Binary, 'register_unsniffable_binary_ext'):
+    Binary.register_unsniffable_binary_ext('hlf')
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/proteogenomics_display_site.txt.sample	Wed May 08 03:25:50 2013 -0400
@@ -0,0 +1,3 @@
+#Proteomic Visualization application should be hosted on the same server as galaxy
+#Entries in this file are of the format "site_id" site_url
+Proteogenomics Browser	http://127.0.0.1:8600
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/protk_display_site.txt.sample	Wed May 08 03:25:50 2013 -0400
@@ -0,0 +1,3 @@
+#Proteomic Visualization application should be hosted on the same server as galaxy
+#Entries in this file are of the format "site_id" site_url
+Proteomics Visualize	http://127.0.0.1:8500