annotate proteomics.py @ 7:9cfabf0b942d draft

Uploaded
author iracooke
date Sun, 14 Dec 2014 22:42:08 -0500
parents
children 6ab4a0bf67df
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
7
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
1 """
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
2 Proteomics format classes
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
3 """
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
4 import logging
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
5 import re
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
6 import binascii
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
7
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
8 from galaxy.datatypes.sniff import *
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
9 from galaxy.datatypes import data
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
10 from galaxy.datatypes.data import Text
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
11 from galaxy.datatypes.xml import GenericXml
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
12 from galaxy.datatypes.binary import Binary
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
13 from galaxy.datatypes.tabular import Tabular
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
14 from galaxy.datatypes.interval import Gff
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
15
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
16 log = logging.getLogger(__name__)
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
17
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
18
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
class Wiff( Binary ):
    """Class for wiff files.

    Composite datatype made of a mandatory ``wiff`` file and an optional
    ``wiff_scan`` file; the primary file is an auto-generated HTML index
    (``composite_type = 'auto_primary_file'``).
    """
    file_ext = 'wiff'
    allow_datatype_change = False
    composite_type = 'auto_primary_file'

    def __init__(self, **kwd):
        """Register the two component files of the composite dataset."""
        Binary.__init__(self, **kwd)
        self.add_composite_file(
            'wiff',
            description='AB SCIEX files in .wiff format. This can contain all needed information or only metadata.',
            is_binary=True )
        # ``optional`` is a real boolean here; the previous string 'True'
        # only worked because any non-empty string is truthy.
        self.add_composite_file(
            'wiff_scan',
            description='AB SCIEX spectra file (wiff.scan), if the corresponding .wiff file only contains metadata.',
            optional=True, is_binary=True )

    def generate_primary_file( self, dataset = None ):
        """Return the HTML index page listing the component files.

        One ``<li>`` is emitted per composite file, with its description
        (when it has one) and an "(optional)" marker for optional files.
        """
        rval = ['<html><head><title>Wiff Composite Dataset </title></head><p/>']
        rval.append('<div>This composite dataset is composed of the following files:<p/><ul>')
        # .items() works on both Python 2 and 3; .iteritems() is Python 2 only.
        for composite_name, composite_file in self.get_composite_files( dataset = dataset ).items():
            fn = composite_name
            opt_text = ' (optional)' if composite_file.optional else ''
            description = composite_file.get('description')
            if description:
                rval.append( '<li><a href="%s" type="text/plain">%s (%s)</a>%s</li>' % ( fn, fn, description, opt_text ) )
            else:
                rval.append( '<li><a href="%s" type="text/plain">%s</a>%s</li>' % ( fn, fn, opt_text ) )
        rval.append( '</ul></div></html>' )
        return "\n".join( rval )


# Only register when the running Galaxy version offers this hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext('wiff')
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
53
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
54
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
class IdpDB( Binary ):
    """Binary datatype stored under the ``idpDB`` extension."""
    file_ext = "idpDB"


# No sniffer exists for this format, so it is registered by extension only,
# and only when the running Galaxy version offers the registration hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext(IdpDB.file_ext)
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
60
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
61
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
class PepXmlReport( Tabular ):
    """pepxml converted to tabular report"""
    file_ext = "tsv"

    def __init__(self, **kwd):
        """Initialise the tabular datatype and set the fixed column headers."""
        Tabular.__init__( self, **kwd )
        # Headers shown in the peek table; 'Interprophet Probability' was
        # previously misspelt as 'Probabaility'.
        self.column_names = [
            'Protein',
            'Peptide',
            'Assumed Charge',
            'Neutral Pep Mass (calculated)',
            'Neutral Mass',
            'Retention Time',
            'Start Scan',
            'End Scan',
            'Search Engine',
            'PeptideProphet Probability',
            'Interprophet Probability',
        ]

    def display_peek( self, dataset ):
        """Returns formatted html of peek"""
        return Tabular.make_html_table( self, dataset, column_names=self.column_names )
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
73
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
74
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
class ProtXmlReport( Tabular ):
    """Tabular report produced from a protXML file."""
    file_ext = "tsv"
    comment_lines = 1

    # Column headers used for the peek table, in file order.
    _COLUMN_NAMES = (
        "Entry Number",
        "Group Probability",
        "Protein",
        "Protein Link",
        "Protein Probability",
        "Percent Coverage",
        "Number of Unique Peptides",
        "Total Independent Spectra",
        "Percent Share of Spectrum ID's",
        "Description",
        "Protein Molecular Weight",
        "Protein Length",
        "Is Nondegenerate Evidence",
        "Weight",
        "Precursor Ion Charge",
        "Peptide sequence",
        "Peptide Link",
        "NSP Adjusted Probability",
        "Initial Probability",
        "Number of Total Termini",
        "Number of Sibling Peptides Bin",
        "Number of Instances",
        "Peptide Group Designator",
        "Is Evidence?",
    )

    def __init__(self, **kwd):
        """Initialise the tabular datatype and attach the column headers."""
        Tabular.__init__( self, **kwd )
        # Each instance gets its own list, as before.
        self.column_names = list( self._COLUMN_NAMES )

    def display_peek( self, dataset ):
        """Return the peek rendered as an HTML table with named columns."""
        html = Tabular.make_html_table( self, dataset, column_names=self.column_names )
        return html
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
87
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
class ProteomicsXml( GenericXml ):
    """ An enhanced XML datatype used to reuse code across several
    proteomic/mass-spec datatypes.

    Subclasses supply ``root`` (a regex fragment matching the root element
    name) and ``blurb`` (the text shown for the dataset).
    """

    def sniff(self, filename):
        """ Determines whether the file is the correct XML type.

        Leading lines that start with ``<?`` (XML declaration, processing
        instructions) are skipped; the first other line must open with
        ``<root`` or ``<ns:root``.  The check is line based, so a root
        element sharing a line with the XML declaration is not recognised.
        """
        # Raw string: '\w' in a normal literal is an invalid escape sequence.
        pattern = r'^<(\w*:)?%s' % self.root  # pattern match <root or <ns:root for any ns string
        with open(filename, 'r') as contents:
            line = contents.readline()
            # readline() returns '' at EOF (never None), which also ends the loop.
            while line.startswith('<?'):
                line = contents.readline()
        return re.match(pattern, line) is not None

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
        dataset.blurb = self.blurb
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
110
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
111
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
class PepXml(ProteomicsXml):
    """pepXML data, recognised by its ``msms_pipeline_analysis`` root element."""
    root = "msms_pipeline_analysis"
    blurb = 'pepXML data'
    file_ext = "pepxml"
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
117
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
118
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
class MzML(ProteomicsXml):
    """mzML data.

    ``root`` is a regex alternation, so documents whose root element is
    either ``mzML`` or ``indexedmzML`` are accepted by the sniffer.
    """
    root = "(mzML|indexedmzML)"
    blurb = 'mzML Mass Spectrometry data'
    file_ext = "mzml"
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
124
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
125
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
class ProtXML(ProteomicsXml):
    """protXML data, recognised by its ``protein_summary`` root element."""
    root = "protein_summary"
    blurb = 'prot XML Search Results'
    file_ext = "protxml"
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
131
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
132
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
class MzXML(ProteomicsXml):
    """mzXML data, recognised by its ``mzXML`` root element."""
    root = "mzXML"
    blurb = "mzXML Mass Spectrometry data"
    file_ext = "mzxml"
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
138
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
139 ## PSI datatypes
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
class MzIdentML(ProteomicsXml):
    """XML datatype recognised by its ``MzIdentML`` root element."""
    root = "MzIdentML"
    blurb = "XML identified peptides and proteins."
    file_ext = "mzid"
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
144
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
145
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
class TraML(ProteomicsXml):
    """TraML transition list, recognised by its ``TraML`` root element."""
    root = "TraML"
    blurb = "TraML transition list"
    file_ext = "traml"
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
150
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
151
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
class MzQuantML(ProteomicsXml):
    """XML quantification data, recognised by its ``MzQuantML`` root element."""
    root = "MzQuantML"
    blurb = "XML quantification data"
    file_ext = "mzq"
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
156
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
157
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
class ConsensusXML(ProteomicsXml):
    """OpenMS alignment file, recognised by its ``consensusXML`` root element."""
    root = "consensusXML"
    blurb = "OpenMS multiple LC-MS map alignment file"
    file_ext = "consensusxml"
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
162
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
163
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
class FeatureXML(ProteomicsXml):
    """OpenMS feature file, recognised by its ``featureMap`` root element."""
    root = "featureMap"
    blurb = "OpenMS feature file"
    file_ext = "featurexml"
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
168
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
169
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
class IdXML(ProteomicsXml):
    """OpenMS identification file, recognised by its ``IdXML`` root element."""
    root = "IdXML"
    blurb = "OpenMS identification file"
    file_ext = "idxml"
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
174
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
class TandemXML(ProteomicsXml):
    """X!Tandem search results, recognised by their ``bioml`` root element."""
    root = "bioml"
    blurb = "X!Tandem search results file"
    file_ext = "tandem"
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
179
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
class Mgf( Text ):
    """Mascot Generic Format data"""
    file_ext = "mgf"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = 'mgf Mascot Generic Format'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff( self, filename ):
        """Return True when a ``BEGIN IONS`` line occurs near the top of the file.

        Only the leading lines are inspected (bounded by ``max_lines``); a
        file that ends before a match is found yields False.
        """
        mgf_begin_ions = "BEGIN IONS"
        max_lines = 100

        # open() inside ``with`` replaces the Python-2-only file() builtin
        # and guarantees the handle is closed.
        with open( filename ) as handle:
            for i, line in enumerate( handle ):
                if line.rstrip( '\n\r' ) == mgf_begin_ions:
                    return True
                if i > max_lines:
                    return False
        # Short file without the marker: explicit False instead of None.
        return False
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
203
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
204
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
class MascotDat( Text ):
    """Mascot search results """
    file_ext = "mascotdat"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = 'mascotdat Mascot Search Results'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff( self, filename ):
        """Return True when the Mascot MIME-Version line occurs near the top of the file.

        Only the leading lines are inspected (bounded by ``max_lines``); a
        file that ends before a match is found yields False.
        """
        mime_version = "MIME-Version: 1.0 (Generated by Mascot version 1.0)"
        max_lines = 10

        # open() inside ``with`` replaces the Python-2-only file() builtin
        # and guarantees the handle is closed.
        with open( filename ) as handle:
            for i, line in enumerate( handle ):
                if line.rstrip( '\n\r' ) == mime_version:
                    return True
                if i > max_lines:
                    return False
        # Short file without the marker: explicit False instead of None.
        return False
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
229
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
230
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
class RAW( Binary ):
    """Class describing a Thermo Finnigan binary RAW file"""
    file_ext = "raw"

    def sniff( self, filename ):
        """Return True when the file header contains the Finnigan signature.

        An unreadable file is reported as "not a RAW file" (False) rather
        than raising.
        """
        # Thermo Finnigan RAW format is proprietary and hence not well documented.
        # Files start with 2 bytes that seem to differ followed by F\0i\0n\0n\0i\0g\0a\0n
        # This combination represents 17 bytes, but to play safe we read 20 bytes from
        # the start of the file.
        try:
            # Binary mode: the header is raw bytes, not text.
            with open( filename, 'rb' ) as handle:
                header = handle.read(20)
        except (IOError, OSError):
            return False
        # Searching the bytes directly avoids the hex-text comparison, which
        # could match at an odd nibble offset and fails on str under Python 3.
        return b'F\0i\0n\0n\0i\0g\0a\0n' in header

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set a fixed peek text and use the file size as blurb."""
        if not dataset.dataset.purged:
            dataset.peek = "Thermo Finnigan RAW file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """Return the stored peek, or a generic description if it cannot be read."""
        try:
            return dataset.peek
        except Exception:
            # Best-effort fallback; KeyboardInterrupt/SystemExit still propagate.
            return "Thermo Finnigan RAW file (%s)" % ( data.nice_size( dataset.get_size() ) )


# Only register when the running Galaxy version offers this hook.
if hasattr(Binary, 'register_sniffable_binary_format'):
    Binary.register_sniffable_binary_format('raw', 'raw', RAW)
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
264
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
265
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
class Msp( Text ):
    """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """
    file_ext = "msp"

    @staticmethod
    def next_line_starts_with(contents, prefix):
        """Consume one line from ``contents`` and report whether it starts with ``prefix``.

        At end of file readline() returns '' (never None), which simply
        fails the prefix test.
        """
        return contents.readline().startswith(prefix)

    def sniff(self, filename):
        """ Determines whether the file is a NIST MSP output file.

        The first line must start with ``Name:`` and the second with ``MW:``.

        >>> fname = get_test_fname('test.msp')
        >>> Msp().sniff(fname)
        True
        >>> fname = get_test_fname('test.mzXML')
        >>> Msp().sniff(fname)
        False
        """
        with open(filename, 'r') as contents:
            # ``and`` short-circuits: the second line is only read when the
            # first one matched.
            return Msp.next_line_starts_with(contents, "Name:") and Msp.next_line_starts_with(contents, "MW:")
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
287
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
class Ms2(Text):
    """Text datatype for ms2 files, recognised by their ``H`` header lines."""
    file_ext = "ms2"

    def sniff(self, filename):
        """ Determines whether the file is a valid ms2 file.

        The leading run of lines starting with ``H<TAB>`` is collected, and
        each required header field must prefix at least one of them.

        >>> fname = get_test_fname('test.msp')
        >>> Ms2().sniff(fname)
        False
        >>> fname = get_test_fname('test.ms2')
        >>> Ms2().sniff(fname)
        True
        """
        header_lines = []
        with open(filename, 'r') as contents:
            # Iterating the handle stops at EOF.  The previous readline()
            # loop ignored the '' returned at EOF and spun forever on empty
            # or header-only files.
            for line in contents:
                if not line.startswith('H\t'):
                    break
                header_lines.append(line)

        for header_field in ['CreationDate', 'Extractor', 'ExtractorVersion', 'ExtractorOptions']:
            prefix = 'H\t%s' % (header_field)
            # Prefix match only, so 'Extractor' is also satisfied by an
            # 'ExtractorVersion' line (unchanged from before).
            if not any(header_line.startswith(prefix) for header_line in header_lines):
                return False

        return True
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
322
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
323 # unsniffable binary format, should do something about this
9cfabf0b942d Uploaded
iracooke
parents:
diff changeset
class XHunterAslFormat( Binary ):
    """Annotated spectra in the HLF format.

    Format description: http://www.thegpm.org/HUNTER/format_2006_09_15.html
    """
    file_ext = "hlf"


# Registered by extension only, and only when the running Galaxy version
# offers the registration hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext(XHunterAslFormat.file_ext)