annotate proteomics.py @ 5:a1b29f86bdbf draft

Uploaded
author iracooke
date Fri, 09 May 2014 04:09:25 -0400
parents b22ebbb05260
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
1 """
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
2 Proteomics format classes
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
3 """
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
4 import logging
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
5 import re
3
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
6 import binascii
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
7
0
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
8 from galaxy.datatypes.sniff import *
3
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
9 from galaxy.datatypes import data
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
10 from galaxy.datatypes.data import Text
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
11 from galaxy.datatypes.xml import GenericXml
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
12 from galaxy.datatypes.binary import Binary
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
13 from galaxy.datatypes.tabular import Tabular
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
14 from galaxy.datatypes.interval import Gff
0
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
15
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
16 log = logging.getLogger(__name__)
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
17
3
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
18
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
class Wiff(Binary):
    """Composite datatype for AB SCIEX wiff files.

    A dataset consists of a ``wiff`` component plus an optional ``wiff_scan``
    component.  The primary file is a generated HTML page that lists the
    components.
    """
    file_ext = 'wiff'
    allow_datatype_change = False
    composite_type = 'auto_primary_file'

    def __init__(self, **kwd):
        """Initialise the binary datatype and declare its component files."""
        Binary.__init__(self, **kwd)
        self.add_composite_file(
            'wiff',
            description='AB SCIEX files in .wiff format. This can contain all needed information or only metadata.',
            is_binary=True)
        # NOTE(review): ``optional`` is given as the string 'True', not a
        # bool.  This file only truth-tests it, so the value is left as-is.
        self.add_composite_file(
            'wiff_scan',
            description='AB SCIEX spectra file (wiff.scan), if the corresponding .wiff file only contains metadata.',
            optional='True', is_binary=True)

    def generate_primary_file(self, dataset=None):
        """Return the HTML index page for the composite dataset.

        :param dataset: dataset forwarded to ``get_composite_files``.
        :returns: HTML text with one list entry per component file; optional
                  components are flagged with " (optional)".
        """
        rval = ['<html><head><title>Wiff Composite Dataset </title></head><p/>']
        rval.append('<div>This composite dataset is composed of the following files:<p/><ul>')
        # .items() exists on Python 2 and 3 mappings; .iteritems() is Python 2 only.
        for composite_name, composite_file in self.get_composite_files(dataset=dataset).items():
            opt_text = ' (optional)' if composite_file.optional else ''
            description = composite_file.get('description')
            if description:
                rval.append('<li><a href="%s" type="text/plain">%s (%s)</a>%s</li>' % (composite_name, composite_name, description, opt_text))
            else:
                rval.append('<li><a href="%s" type="text/plain">%s</a>%s</li>' % (composite_name, composite_name, opt_text))
        rval.append('</ul></div></html>')
        return "\n".join(rval)
0
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
48
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
49
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
50
3
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
# Register 'wiff' as an unsniffable binary extension, but only when the
# Binary class in use actually exposes that registration hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext('wiff')
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
53
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
54
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
class IdpDB(Binary):
    """Binary datatype for files carrying the ``idpDB`` extension."""
    file_ext = 'idpDB'
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
57
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
# Register 'idpDB' as an unsniffable binary extension, but only when the
# Binary class in use actually exposes that registration hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext('idpDB')
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
60
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
61
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
class PepXmlReport(Tabular):
    """pepxml converted to tabular report"""
    file_ext = "tsv"

    def __init__(self, **kwd):
        """Initialise the tabular datatype and set the report's column headers."""
        Tabular.__init__(self, **kwd)
        # Headers shown in the HTML peek table; the last entry previously
        # carried a spelling mistake ("Probabaility").
        self.column_names = [
            'Protein',
            'Peptide',
            'Assumed Charge',
            'Neutral Pep Mass (calculated)',
            'Neutral Mass',
            'Retention Time',
            'Start Scan',
            'End Scan',
            'Search Engine',
            'PeptideProphet Probability',
            'Interprophet Probability',
        ]

    def display_peek(self, dataset):
        """Return the peek as an HTML table using this report's column names.

        :param dataset: dataset whose peek is rendered.
        """
        return Tabular.make_html_table(self, dataset, column_names=self.column_names)
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
73
0
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
74
3
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
class ProtXmlReport(Tabular):
    """Tabular report produced from a protXML file."""
    file_ext = "tsv"
    comment_lines = 1

    def __init__(self, **kwd):
        """Initialise the tabular datatype and set the report's column headers."""
        Tabular.__init__(self, **kwd)
        self.column_names = [
            "Entry Number",
            "Group Probability",
            "Protein",
            "Protein Link",
            "Protein Probability",
            "Percent Coverage",
            "Number of Unique Peptides",
            "Total Independent Spectra",
            "Percent Share of Spectrum ID's",
            "Description",
            "Protein Molecular Weight",
            "Protein Length",
            "Is Nondegenerate Evidence",
            "Weight",
            "Precursor Ion Charge",
            "Peptide sequence",
            "Peptide Link",
            "NSP Adjusted Probability",
            "Initial Probability",
            "Number of Total Termini",
            "Number of Sibling Peptides Bin",
            "Number of Instances",
            "Peptide Group Designator",
            "Is Evidence?",
        ]

    def display_peek(self, dataset):
        """Return the peek as an HTML table using this report's column names."""
        headers = self.column_names
        return Tabular.make_html_table(self, dataset, column_names=headers)
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
87
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
class ProteomicsXml(GenericXml):
    """ An enhanced XML datatype used to reuse code across several
    proteomic/mass-spec datatypes.

    Subclasses supply ``root`` (a regex fragment naming the expected root
    element) and ``blurb`` (the text stored as the dataset blurb).
    """

    def sniff(self, filename):
        """Report whether the first non-``<?`` line opens the expected root element.

        Lines starting with ``<?`` are skipped.  The next line must start
        with ``<root`` or ``<ns:root`` for any namespace prefix ``ns``.
        """
        with open(filename, 'r') as contents:
            candidate = contents.readline()
            # readline() yields '' at end of file, which also ends this loop.
            while candidate is not None and candidate.startswith('<?'):
                candidate = contents.readline()
        if candidate is None:
            return False
        root_pattern = r'^<(\w*:)?%s' % self.root
        return re.match(root_pattern, candidate) is not None

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
        dataset.blurb = self.blurb
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
110
3
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
111
0
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
class PepXml(ProteomicsXml):
    """pepXML datatype; expected root element is ``msms_pipeline_analysis``."""
    file_ext = 'pepxml'
    blurb = "pepXML data"
    root = 'msms_pipeline_analysis'
3
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
117
0
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
118
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
class MzML(ProteomicsXml):
    """mzML datatype; expected root element is ``mzML`` or ``indexedmzML``."""
    file_ext = 'mzml'
    blurb = "mzML Mass Spectrometry data"
    # Regex alternation, interpolated into the sniffing pattern.
    root = '(mzML|indexedmzML)'
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
124
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
125
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
class ProtXML(ProteomicsXml):
    """protXML datatype; expected root element is ``protein_summary``."""
    file_ext = 'protxml'
    blurb = "prot XML Search Results"
    root = 'protein_summary'
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
131
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
132
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
class MzXML(ProteomicsXml):
    """mzXML datatype; expected root element is ``mzXML``."""
    file_ext = 'mzxml'
    blurb = 'mzXML Mass Spectrometry data'
    root = 'mzXML'
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
138
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
139 ## PSI datatypes
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
class MzIdentML(ProteomicsXml):
    """mzIdentML datatype; expected root element is ``MzIdentML``."""
    file_ext = 'mzid'
    blurb = 'XML identified peptides and proteins.'
    root = 'MzIdentML'
3
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
144
0
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
145
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
class TraML(ProteomicsXml):
    """TraML datatype; expected root element is ``TraML``."""
    file_ext = 'traml'
    blurb = 'TraML transition list'
    root = 'TraML'
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
150
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
151
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
class MzQuantML(ProteomicsXml):
    """mzQuantML datatype; expected root element is ``MzQuantML``."""
    file_ext = 'mzq'
    blurb = 'XML quantification data'
    root = 'MzQuantML'
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
156
3
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
157
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
class ConsensusXML(ProteomicsXml):
    """consensusXML datatype; expected root element is ``consensusXML``."""
    file_ext = 'consensusxml'
    blurb = 'OpenMS multiple LC-MS map alignment file'
    root = 'consensusXML'
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
162
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
163
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
class FeatureXML(ProteomicsXml):
    """featureXML datatype; expected root element is ``featureMap``."""
    file_ext = 'featurexml'
    blurb = 'OpenMS feature file'
    root = 'featureMap'
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
168
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
169
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
class IdXML(ProteomicsXml):
    """idXML datatype; expected root element is ``IdXML``."""
    file_ext = 'idxml'
    blurb = 'OpenMS identification file'
    root = 'IdXML'
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
174
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
175
0
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
class Mgf(Text):
    """Mascot Generic Format data"""
    file_ext = "mgf"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'mgf Mascot Generic Format'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """Report whether a ``BEGIN IONS`` line occurs near the top of the file.

        :param filename: path of the file to inspect.
        :returns: True when a line equal to ``BEGIN IONS`` (ignoring the line
                  ending) is found before the scan limit, otherwise False.
        """
        mgf_begin_ions = "BEGIN IONS"
        max_lines = 100

        # open() inside ``with`` closes the handle on every exit path and,
        # unlike the file() builtin, exists on both Python 2 and 3.
        with open(filename) as handle:
            for i, line in enumerate(handle):
                if line.rstrip('\n\r') == mgf_begin_ions:
                    return True
                if i > max_lines:
                    return False
        # File ended before the scan limit without a match.
        return False
3
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
199
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
200
0
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
class MascotDat(Text):
    """Mascot search results """
    file_ext = "mascotdat"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'mascotdat Mascot Search Results'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """Report whether the Mascot MIME-Version line occurs near the top of the file.

        :param filename: path of the file to inspect.
        :returns: True when a line equal to the expected MIME-Version header
                  (ignoring the line ending) is found before the scan limit,
                  otherwise False.
        """
        mime_version = "MIME-Version: 1.0 (Generated by Mascot version 1.0)"
        max_lines = 10

        # open() inside ``with`` closes the handle on every exit path and,
        # unlike the file() builtin, exists on both Python 2 and 3.
        with open(filename) as handle:
            for i, line in enumerate(handle):
                if line.rstrip('\n\r') == mime_version:
                    return True
                if i > max_lines:
                    return False
        # File ended before the scan limit without a match.
        return False
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
225
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
226
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
class RAW(Binary):
    """Class describing a Thermo Finnigan binary RAW file"""
    file_ext = "raw"

    def sniff(self, filename):
        """Report whether the file header carries the Finnigan signature.

        :param filename: path of the file to inspect.
        :returns: True when the signature is found in the first 20 bytes;
                  False otherwise, including when the file cannot be read.
        """
        # Thermo Finnigan RAW format is proprietary and hence not well documented.
        # Files start with 2 bytes that seem to differ followed by F\0i\0n\0n\0i\0g\0a\0n
        # This combination represents 17 bytes, but to play safe we read 20 bytes from
        # the start of the file.
        finnigan = b'F\0i\0n\0n\0i\0g\0a\0n'
        try:
            # Binary mode avoids newline translation and text decoding of the header.
            with open(filename, 'rb') as handle:
                header = handle.read(20)
        except (IOError, OSError):
            return False
        # Byte-level search: a hex-string search could match across byte boundaries.
        return finnigan in header

    def set_peek(self, dataset, is_multi_byte=False):
        """Set a fixed peek text and use the human-readable file size as blurb."""
        if not dataset.dataset.purged:
            dataset.peek = "Thermo Finnigan RAW file"
            dataset.blurb = data.nice_size(dataset.get_size())
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek(self, dataset):
        """Return the stored peek, falling back to a description with the file size."""
        try:
            return dataset.peek
        except Exception:
            return "Thermo Finnigan RAW file (%s)" % (data.nice_size(dataset.get_size()))
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
256
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
257
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
# Register RAW as a sniffable binary format under the 'raw' name and
# extension, but only when the Binary class in use exposes that hook.
if hasattr(Binary, 'register_sniffable_binary_format'):
    Binary.register_sniffable_binary_format('raw', 'raw', RAW)
0
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
260
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
261
3
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
class Msp(Text):
    """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """
    file_ext = "msp"

    @staticmethod
    def next_line_starts_with(contents, prefix):
        """Read one line from ``contents`` and report whether it begins with ``prefix``."""
        candidate = contents.readline()
        if candidate is None:
            return False
        return candidate.startswith(prefix)

    def sniff(self, filename):
        """ Determines whether the file is a NIST MSP output file.

        The first line must start with "Name:" and the second with "MW:".

        >>> fname = get_test_fname('test.msp')
        >>> Msp().sniff(fname)
        True
        >>> fname = get_test_fname('test.mzXML')
        >>> Msp().sniff(fname)
        False
        """
        with open(filename, 'r') as contents:
            # The second line is only read when the first one matched.
            if not Msp.next_line_starts_with(contents, "Name:"):
                return False
            return Msp.next_line_starts_with(contents, "MW:")
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
283
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
class Ms2(Text):
    """Text datatype for files carrying the ``ms2`` extension."""
    file_ext = "ms2"

    def sniff(self, filename):
        """ Determines whether the file is a valid ms2 file.

        The leading run of lines starting with ``H<TAB>`` is collected, and
        each of the CreationDate, Extractor, ExtractorVersion and
        ExtractorOptions fields must appear among them.

        >>> fname = get_test_fname('test.msp')
        >>> Ms2().sniff(fname)
        False
        >>> fname = get_test_fname('test.ms2')
        >>> Ms2().sniff(fname)
        True
        """
        required_fields = ['CreationDate', 'Extractor', 'ExtractorVersion', 'ExtractorOptions']
        header_lines = []
        with open(filename, 'r') as contents:
            # Iterating the handle stops at end of file, so an empty or
            # header-only file cannot keep the loop spinning.
            for line in contents:
                if not line.startswith('H\t'):
                    break
                header_lines.append(line)

        for header_field in required_fields:
            wanted_prefix = 'H\t%s' % (header_field)
            if not any(header_line.startswith(wanted_prefix) for header_line in header_lines):
                return False
        return True
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
318
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
319 # unsniffable binary format, should do something about this
3
b22ebbb05260 Uploaded
iracooke
parents: 0
diff changeset
class XHunterAslFormat(Binary):
    """ Annotated Spectra in the HLF format http://www.thegpm.org/HUNTER/format_2006_09_15.html """
    file_ext = 'hlf'
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
323
7101f7e4b00b Uploaded
iracooke
parents:
diff changeset
# Register 'hlf' as an unsniffable binary extension, but only when the
# Binary class in use actually exposes that registration hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext('hlf')