comparison proteomics.py @ 3:b22ebbb05260 draft

Uploaded
author iracooke
date Mon, 10 Mar 2014 19:40:20 -0400
parents 7101f7e4b00b
children
comparison
equal deleted inserted replaced
2:7edcae695986 3:b22ebbb05260
1 """ 1 """
2 Proteomics format classes 2 Proteomics format classes
3 """ 3 """
4 import logging 4 import logging
5 import re 5 import re
6 from galaxy.datatypes.data import * 6 import binascii
7 from galaxy.datatypes.xml import * 7
8 from galaxy.datatypes.sniff import * 8 from galaxy.datatypes.sniff import *
9 from galaxy.datatypes.binary import * 9 from galaxy.datatypes import data
10 from galaxy.datatypes.interval import * 10 from galaxy.datatypes.data import Text
11 from galaxy.datatypes.xml import GenericXml
12 from galaxy.datatypes.binary import Binary
13 from galaxy.datatypes.tabular import Tabular
14 from galaxy.datatypes.interval import Gff
11 15
12 log = logging.getLogger(__name__) 16 log = logging.getLogger(__name__)
13 17
14 class ProtGff( Gff ): 18
15 """Tab delimited data in Gff format""" 19 class Wiff( Binary ):
16 file_ext = "prot_gff" 20 """Class for wiff files."""
17 def set_peek( self, dataset, is_multi_byte=False ): 21 file_ext = 'wiff'
18 """Set the peek and blurb text""" 22 allow_datatype_change = False
19 if not dataset.dataset.purged: 23 composite_type = 'auto_primary_file'
20 dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte ) 24
21 dataset.blurb = 'Proteogenomics GFF' 25 def __init__(self, **kwd):
22 else: 26 Binary.__init__(self, **kwd)
23 dataset.peek = 'file does not exist' 27 self.add_composite_file( 'wiff',
24 dataset.blurb = 'file purged from disk' 28 description = 'AB SCIEX files in .wiff format. This can contain all needed information or only metadata.',
25 29 is_binary = True )
26 def sniff( self, filename ): 30 self.add_composite_file( 'wiff_scan',
27 handle = open(filename) 31 description = 'AB SCIEX spectra file (wiff.scan), if the corresponding .wiff file only contains metadata.',
28 xmlns_re = re.compile("^##gff-version") 32 optional = 'True', is_binary = True )
29 for i in range(3): 33
30 line = handle.readline() 34 def generate_primary_file( self, dataset = None ):
31 if xmlns_re.match(line.strip()): 35 rval = ['<html><head><title>Wiff Composite Dataset </title></head><p/>']
32 handle.close() 36 rval.append('<div>This composite dataset is composed of the following files:<p/><ul>')
33 return True 37 for composite_name, composite_file in self.get_composite_files( dataset = dataset ).iteritems():
34 38 fn = composite_name
35 handle.close() 39 opt_text = ''
36 return False 40 if composite_file.optional:
37 41 opt_text = ' (optional)'
38 42 if composite_file.get('description'):
39 class Xls( Binary ): 43 rval.append( '<li><a href="%s" type="text/plain">%s (%s)</a>%s</li>' % ( fn, fn, composite_file.get('description'), opt_text ) )
40 """Class describing a binary excel spreadsheet file""" 44 else:
41 file_ext = "xls" 45 rval.append( '<li><a href="%s" type="text/plain">%s</a>%s</li>' % ( fn, fn, opt_text ) )
42 46 rval.append( '</ul></div></html>' )
43 def set_peek( self, dataset, is_multi_byte=False ): 47 return "\n".join( rval )
44 if not dataset.dataset.purged: 48
45 dataset.peek = "Excel Spreadsheet file" 49
46 dataset.blurb = data.nice_size( dataset.get_size() ) 50
47 else: 51 if hasattr(Binary, 'register_unsniffable_binary_ext'):
48 dataset.peek = 'file does not exist' 52 Binary.register_unsniffable_binary_ext('wiff')
49 dataset.blurb = 'file purged from disk' 53
54
55 class IdpDB( Binary ):
56 file_ext = "idpDB"
57
58 if hasattr(Binary, 'register_unsniffable_binary_ext'):
59 Binary.register_unsniffable_binary_ext('idpDB')
60
61
62 class PepXmlReport( Tabular ):
63 """pepxml converted to tabular report"""
64 file_ext = "tsv"
65
66 def __init__(self, **kwd):
67 Tabular.__init__( self, **kwd )
68 self.column_names = ['Protein', 'Peptide', 'Assumed Charge', 'Neutral Pep Mass (calculated)', 'Neutral Mass', 'Retention Time', 'Start Scan', 'End Scan', 'Search Engine', 'PeptideProphet Probability', 'Interprophet Probabaility']
69
50 def display_peek( self, dataset ): 70 def display_peek( self, dataset ):
51 try: 71 """Returns formated html of peek"""
52 return dataset.peek 72 return Tabular.make_html_table( self, dataset, column_names=self.column_names )
53 except: 73
54 return "Binary xls file (%s)" % ( data.nice_size( dataset.get_size() ) ) 74
55 75 class ProtXmlReport( Tabular ):
56 class ProteomicsXml(GenericXml): 76 """protxml converted to tabular report"""
77 file_ext = "tsv"
78 comment_lines = 1
79
80 def __init__(self, **kwd):
81 Tabular.__init__( self, **kwd )
82 self.column_names = ["Entry Number", "Group Probability", "Protein", "Protein Link", "Protein Probability", "Percent Coverage", "Number of Unique Peptides", "Total Independent Spectra", "Percent Share of Spectrum ID's", "Description", "Protein Molecular Weight", "Protein Length", "Is Nondegenerate Evidence", "Weight", "Precursor Ion Charge", "Peptide sequence", "Peptide Link", "NSP Adjusted Probability", "Initial Probability", "Number of Total Termini", "Number of Sibling Peptides Bin", "Number of Instances", "Peptide Group Designator", "Is Evidence?"]
83
84 def display_peek( self, dataset ):
85 """Returns formated html of peek"""
86 return Tabular.make_html_table( self, dataset, column_names=self.column_names )
87
88 class ProteomicsXml( GenericXml ):
57 """ An enhanced XML datatype used to reuse code across several 89 """ An enhanced XML datatype used to reuse code across several
58 proteomic/mass-spec datatypes. """ 90 proteomic/mass-spec datatypes. """
59 91
60 def sniff(self, filename): 92 def sniff(self, filename):
61 """ Determines whether the file is the correct XML type. """ 93 """ Determines whether the file is the correct XML type. """
62 with open(filename, 'r') as contents: 94 with open(filename, 'r') as contents:
63 while True: 95 while True:
64 line = contents.readline() 96 line = contents.readline()
65 if line == None or not line.startswith('<?'): 97 if line == None or not line.startswith('<?'):
66 break 98 break
67 pattern = '^<(\w*:)?%s' % self.root # pattern match <root or <ns:root for any ns string 99 pattern = '^<(\w*:)?%s' % self.root # pattern match <root or <ns:root for any ns string
74 dataset.blurb = self.blurb 106 dataset.blurb = self.blurb
75 else: 107 else:
76 dataset.peek = 'file does not exist' 108 dataset.peek = 'file does not exist'
77 dataset.blurb = 'file purged from disk' 109 dataset.blurb = 'file purged from disk'
78 110
111
79 class PepXml(ProteomicsXml): 112 class PepXml(ProteomicsXml):
80 """pepXML data""" 113 """pepXML data"""
81 file_ext = "pepxml" 114 file_ext = "pepxml"
82 blurb = 'pepXML data' 115 blurb = 'pepXML data'
83 root = "msms_pipeline_analysis" 116 root = "msms_pipeline_analysis"
84 117
85 118
86 class MzML(ProteomicsXml): 119 class MzML(ProteomicsXml):
87 """mzML data""" 120 """mzML data"""
88 file_ext = "mzml" 121 file_ext = "mzml"
89 blurb = 'mzML Mass Spectrometry data' 122 blurb = 'mzML Mass Spectrometry data'
97 root = "protein_summary" 130 root = "protein_summary"
98 131
99 132
100 class MzXML(ProteomicsXml): 133 class MzXML(ProteomicsXml):
101 """mzXML data""" 134 """mzXML data"""
102 file_ext = "mzXML" 135 file_ext = "mzxml"
103 blurb = "mzXML Mass Spectrometry data" 136 blurb = "mzXML Mass Spectrometry data"
104 root = "mzXML" 137 root = "mzXML"
105 138
106 ## PSI datatypes 139 ## PSI datatypes
107 class MzIdentML(ProteomicsXml): 140 class MzIdentML(ProteomicsXml):
108 file_ext = "mzid" 141 file_ext = "mzid"
109 blurb = "XML identified peptides and proteins." 142 blurb = "XML identified peptides and proteins."
110 root = "MzIdentML" 143 root = "MzIdentML"
111 144
112 145
113 class TraML(ProteomicsXml): 146 class TraML(ProteomicsXml):
114 file_ext = "traML" 147 file_ext = "traml"
115 blurb = "TraML transition list" 148 blurb = "TraML transition list"
116 root = "TraML" 149 root = "TraML"
117 150
118 151
119 class MzQuantML(ProteomicsXml): 152 class MzQuantML(ProteomicsXml):
120 file_ext = "mzq" 153 file_ext = "mzq"
121 blurb = "XML quantification data" 154 blurb = "XML quantification data"
122 root = "MzQuantML" 155 root = "MzQuantML"
123 156
124 157
158 class ConsensusXML(ProteomicsXml):
159 file_ext = "consensusxml"
160 blurb = "OpenMS multiple LC-MS map alignment file"
161 root = "consensusXML"
162
163
164 class FeatureXML(ProteomicsXml):
165 file_ext = "featurexml"
166 blurb = "OpenMS feature file"
167 root = "featureMap"
168
169
170 class IdXML(ProteomicsXml):
171 file_ext = "idxml"
172 blurb = "OpenMS identification file"
173 root = "IdXML"
174
175
125 class Mgf( Text ): 176 class Mgf( Text ):
126 """Mascot Generic Format data""" 177 """Mascot Generic Format data"""
127 file_ext = "mgf" 178 file_ext = "mgf"
128 179
129 def set_peek( self, dataset, is_multi_byte=False ): 180 def set_peek( self, dataset, is_multi_byte=False ):
133 dataset.blurb = 'mgf Mascot Generic Format' 184 dataset.blurb = 'mgf Mascot Generic Format'
134 else: 185 else:
135 dataset.peek = 'file does not exist' 186 dataset.peek = 'file does not exist'
136 dataset.blurb = 'file purged from disk' 187 dataset.blurb = 'file purged from disk'
137 188
138
139 def sniff( self, filename ): 189 def sniff( self, filename ):
140 mgf_begin_ions = "BEGIN IONS" 190 mgf_begin_ions = "BEGIN IONS"
141 max_lines=100 191 max_lines=100
142 192
143 for i, line in enumerate( file( filename ) ): 193 for i, line in enumerate( file( filename ) ):
144 line = line.rstrip( '\n\r' ) 194 line = line.rstrip( '\n\r' )
145 if line==mgf_begin_ions: 195 if line==mgf_begin_ions:
146 return True 196 return True
147 if i>max_lines: 197 if i>max_lines:
148 return False 198 return False
149 199
150 200
151 class MascotDat( Text ): 201 class MascotDat( Text ):
152 """Mascot search results """ 202 """Mascot search results """
153 file_ext = "mascotdat" 203 file_ext = "mascotdat"
154 204
155 def set_peek( self, dataset, is_multi_byte=False ): 205 def set_peek( self, dataset, is_multi_byte=False ):
204 except: 254 except:
205 return "Thermo Finnigan RAW file (%s)" % ( data.nice_size( dataset.get_size() ) ) 255 return "Thermo Finnigan RAW file (%s)" % ( data.nice_size( dataset.get_size() ) )
206 256
207 257
208 if hasattr(Binary, 'register_sniffable_binary_format'): 258 if hasattr(Binary, 'register_sniffable_binary_format'):
209 Binary.register_sniffable_binary_format('RAW', 'RAW', RAW) 259 Binary.register_sniffable_binary_format('raw', 'raw', RAW)
210 260
211 261
212 class Msp(Text): 262 class Msp( Text ):
213 """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """ 263 """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """
214 file_ext = "msp" 264 file_ext = "msp"
215 265
216 @staticmethod 266 @staticmethod
217 def next_line_starts_with(contents, prefix): 267 def next_line_starts_with(contents, prefix):
265 return False 315 return False
266 316
267 return True 317 return True
268 318
269 # unsniffable binary format, should do something about this 319 # unsniffable binary format, should do something about this
270 class XHunterAslFormat(Binary): 320 class XHunterAslFormat( Binary ):
271 """ Annotated Spectra in the HLF format http://www.thegpm.org/HUNTER/format_2006_09_15.html """ 321 """ Annotated Spectra in the HLF format http://www.thegpm.org/HUNTER/format_2006_09_15.html """
272 file_ext = "hlf" 322 file_ext = "hlf"
273 323
274
275 if hasattr(Binary, 'register_unsniffable_binary_ext'): 324 if hasattr(Binary, 'register_unsniffable_binary_ext'):
276 Binary.register_unsniffable_binary_ext('hlf') 325 Binary.register_unsniffable_binary_ext('hlf')