Mercurial > repos > iracooke > proteomics_datatypes
comparison proteomics.py @ 3:b22ebbb05260 draft
Uploaded
author | iracooke |
---|---|
date | Mon, 10 Mar 2014 19:40:20 -0400 |
parents | 7101f7e4b00b |
children |
comparison
equal
deleted
inserted
replaced
2:7edcae695986 | 3:b22ebbb05260 |
---|---|
1 """ | 1 """ |
2 Proteomics format classes | 2 Proteomics format classes |
3 """ | 3 """ |
4 import logging | 4 import logging |
5 import re | 5 import re |
6 from galaxy.datatypes.data import * | 6 import binascii |
7 from galaxy.datatypes.xml import * | 7 |
8 from galaxy.datatypes.sniff import * | 8 from galaxy.datatypes.sniff import * |
9 from galaxy.datatypes.binary import * | 9 from galaxy.datatypes import data |
10 from galaxy.datatypes.interval import * | 10 from galaxy.datatypes.data import Text |
11 from galaxy.datatypes.xml import GenericXml | |
12 from galaxy.datatypes.binary import Binary | |
13 from galaxy.datatypes.tabular import Tabular | |
14 from galaxy.datatypes.interval import Gff | |
11 | 15 |
12 log = logging.getLogger(__name__) | 16 log = logging.getLogger(__name__) |
13 | 17 |
14 class ProtGff( Gff ): | 18 |
15 """Tab delimited data in Gff format""" | 19 class Wiff( Binary ): |
16 file_ext = "prot_gff" | 20 """Class for wiff files.""" |
17 def set_peek( self, dataset, is_multi_byte=False ): | 21 file_ext = 'wiff' |
18 """Set the peek and blurb text""" | 22 allow_datatype_change = False |
19 if not dataset.dataset.purged: | 23 composite_type = 'auto_primary_file' |
20 dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte ) | 24 |
21 dataset.blurb = 'Proteogenomics GFF' | 25 def __init__(self, **kwd): |
22 else: | 26 Binary.__init__(self, **kwd) |
23 dataset.peek = 'file does not exist' | 27 self.add_composite_file( 'wiff', |
24 dataset.blurb = 'file purged from disk' | 28 description = 'AB SCIEX files in .wiff format. This can contain all needed information or only metadata.', |
25 | 29 is_binary = True ) |
26 def sniff( self, filename ): | 30 self.add_composite_file( 'wiff_scan', |
27 handle = open(filename) | 31 description = 'AB SCIEX spectra file (wiff.scan), if the corresponding .wiff file only contains metadata.', |
28 xmlns_re = re.compile("^##gff-version") | 32 optional = 'True', is_binary = True ) |
29 for i in range(3): | 33 |
30 line = handle.readline() | 34 def generate_primary_file( self, dataset = None ): |
31 if xmlns_re.match(line.strip()): | 35 rval = ['<html><head><title>Wiff Composite Dataset </title></head><p/>'] |
32 handle.close() | 36 rval.append('<div>This composite dataset is composed of the following files:<p/><ul>') |
33 return True | 37 for composite_name, composite_file in self.get_composite_files( dataset = dataset ).iteritems(): |
34 | 38 fn = composite_name |
35 handle.close() | 39 opt_text = '' |
36 return False | 40 if composite_file.optional: |
37 | 41 opt_text = ' (optional)' |
38 | 42 if composite_file.get('description'): |
39 class Xls( Binary ): | 43 rval.append( '<li><a href="%s" type="text/plain">%s (%s)</a>%s</li>' % ( fn, fn, composite_file.get('description'), opt_text ) ) |
40 """Class describing a binary excel spreadsheet file""" | 44 else: |
41 file_ext = "xls" | 45 rval.append( '<li><a href="%s" type="text/plain">%s</a>%s</li>' % ( fn, fn, opt_text ) ) |
42 | 46 rval.append( '</ul></div></html>' ) |
43 def set_peek( self, dataset, is_multi_byte=False ): | 47 return "\n".join( rval ) |
44 if not dataset.dataset.purged: | 48 |
45 dataset.peek = "Excel Spreadsheet file" | 49 |
46 dataset.blurb = data.nice_size( dataset.get_size() ) | 50 |
47 else: | 51 if hasattr(Binary, 'register_unsniffable_binary_ext'): |
48 dataset.peek = 'file does not exist' | 52 Binary.register_unsniffable_binary_ext('wiff') |
49 dataset.blurb = 'file purged from disk' | 53 |
54 | |
55 class IdpDB( Binary ): | |
56 file_ext = "idpDB" | |
57 | |
58 if hasattr(Binary, 'register_unsniffable_binary_ext'): | |
59 Binary.register_unsniffable_binary_ext('idpDB') | |
60 | |
61 | |
62 class PepXmlReport( Tabular ): | |
63 """pepxml converted to tabular report""" | |
64 file_ext = "tsv" | |
65 | |
66 def __init__(self, **kwd): | |
67 Tabular.__init__( self, **kwd ) | |
68 self.column_names = ['Protein', 'Peptide', 'Assumed Charge', 'Neutral Pep Mass (calculated)', 'Neutral Mass', 'Retention Time', 'Start Scan', 'End Scan', 'Search Engine', 'PeptideProphet Probability', 'Interprophet Probabaility'] | |
69 | |
50 def display_peek( self, dataset ): | 70 def display_peek( self, dataset ): |
51 try: | 71 """Returns formated html of peek""" |
52 return dataset.peek | 72 return Tabular.make_html_table( self, dataset, column_names=self.column_names ) |
53 except: | 73 |
54 return "Binary xls file (%s)" % ( data.nice_size( dataset.get_size() ) ) | 74 |
55 | 75 class ProtXmlReport( Tabular ): |
56 class ProteomicsXml(GenericXml): | 76 """protxml converted to tabular report""" |
77 file_ext = "tsv" | |
78 comment_lines = 1 | |
79 | |
80 def __init__(self, **kwd): | |
81 Tabular.__init__( self, **kwd ) | |
82 self.column_names = ["Entry Number", "Group Probability", "Protein", "Protein Link", "Protein Probability", "Percent Coverage", "Number of Unique Peptides", "Total Independent Spectra", "Percent Share of Spectrum ID's", "Description", "Protein Molecular Weight", "Protein Length", "Is Nondegenerate Evidence", "Weight", "Precursor Ion Charge", "Peptide sequence", "Peptide Link", "NSP Adjusted Probability", "Initial Probability", "Number of Total Termini", "Number of Sibling Peptides Bin", "Number of Instances", "Peptide Group Designator", "Is Evidence?"] | |
83 | |
84 def display_peek( self, dataset ): | |
85 """Returns formated html of peek""" | |
86 return Tabular.make_html_table( self, dataset, column_names=self.column_names ) | |
87 | |
88 class ProteomicsXml( GenericXml ): | |
57 """ An enhanced XML datatype used to reuse code across several | 89 """ An enhanced XML datatype used to reuse code across several |
58 proteomic/mass-spec datatypes. """ | 90 proteomic/mass-spec datatypes. """ |
59 | 91 |
60 def sniff(self, filename): | 92 def sniff(self, filename): |
61 """ Determines whether the file is the correct XML type. """ | 93 """ Determines whether the file is the correct XML type. """ |
62 with open(filename, 'r') as contents: | 94 with open(filename, 'r') as contents: |
63 while True: | 95 while True: |
64 line = contents.readline() | 96 line = contents.readline() |
65 if line == None or not line.startswith('<?'): | 97 if line == None or not line.startswith('<?'): |
66 break | 98 break |
67 pattern = '^<(\w*:)?%s' % self.root # pattern match <root or <ns:root for any ns string | 99 pattern = '^<(\w*:)?%s' % self.root # pattern match <root or <ns:root for any ns string |
74 dataset.blurb = self.blurb | 106 dataset.blurb = self.blurb |
75 else: | 107 else: |
76 dataset.peek = 'file does not exist' | 108 dataset.peek = 'file does not exist' |
77 dataset.blurb = 'file purged from disk' | 109 dataset.blurb = 'file purged from disk' |
78 | 110 |
111 | |
79 class PepXml(ProteomicsXml): | 112 class PepXml(ProteomicsXml): |
80 """pepXML data""" | 113 """pepXML data""" |
81 file_ext = "pepxml" | 114 file_ext = "pepxml" |
82 blurb = 'pepXML data' | 115 blurb = 'pepXML data' |
83 root = "msms_pipeline_analysis" | 116 root = "msms_pipeline_analysis" |
84 | 117 |
85 | 118 |
86 class MzML(ProteomicsXml): | 119 class MzML(ProteomicsXml): |
87 """mzML data""" | 120 """mzML data""" |
88 file_ext = "mzml" | 121 file_ext = "mzml" |
89 blurb = 'mzML Mass Spectrometry data' | 122 blurb = 'mzML Mass Spectrometry data' |
97 root = "protein_summary" | 130 root = "protein_summary" |
98 | 131 |
99 | 132 |
100 class MzXML(ProteomicsXml): | 133 class MzXML(ProteomicsXml): |
101 """mzXML data""" | 134 """mzXML data""" |
102 file_ext = "mzXML" | 135 file_ext = "mzxml" |
103 blurb = "mzXML Mass Spectrometry data" | 136 blurb = "mzXML Mass Spectrometry data" |
104 root = "mzXML" | 137 root = "mzXML" |
105 | 138 |
106 ## PSI datatypes | 139 ## PSI datatypes |
107 class MzIdentML(ProteomicsXml): | 140 class MzIdentML(ProteomicsXml): |
108 file_ext = "mzid" | 141 file_ext = "mzid" |
109 blurb = "XML identified peptides and proteins." | 142 blurb = "XML identified peptides and proteins." |
110 root = "MzIdentML" | 143 root = "MzIdentML" |
111 | 144 |
112 | 145 |
113 class TraML(ProteomicsXml): | 146 class TraML(ProteomicsXml): |
114 file_ext = "traML" | 147 file_ext = "traml" |
115 blurb = "TraML transition list" | 148 blurb = "TraML transition list" |
116 root = "TraML" | 149 root = "TraML" |
117 | 150 |
118 | 151 |
119 class MzQuantML(ProteomicsXml): | 152 class MzQuantML(ProteomicsXml): |
120 file_ext = "mzq" | 153 file_ext = "mzq" |
121 blurb = "XML quantification data" | 154 blurb = "XML quantification data" |
122 root = "MzQuantML" | 155 root = "MzQuantML" |
123 | 156 |
124 | 157 |
158 class ConsensusXML(ProteomicsXml): | |
159 file_ext = "consensusxml" | |
160 blurb = "OpenMS multiple LC-MS map alignment file" | |
161 root = "consensusXML" | |
162 | |
163 | |
164 class FeatureXML(ProteomicsXml): | |
165 file_ext = "featurexml" | |
166 blurb = "OpenMS feature file" | |
167 root = "featureMap" | |
168 | |
169 | |
170 class IdXML(ProteomicsXml): | |
171 file_ext = "idxml" | |
172 blurb = "OpenMS identification file" | |
173 root = "IdXML" | |
174 | |
175 | |
125 class Mgf( Text ): | 176 class Mgf( Text ): |
126 """Mascot Generic Format data""" | 177 """Mascot Generic Format data""" |
127 file_ext = "mgf" | 178 file_ext = "mgf" |
128 | 179 |
129 def set_peek( self, dataset, is_multi_byte=False ): | 180 def set_peek( self, dataset, is_multi_byte=False ): |
133 dataset.blurb = 'mgf Mascot Generic Format' | 184 dataset.blurb = 'mgf Mascot Generic Format' |
134 else: | 185 else: |
135 dataset.peek = 'file does not exist' | 186 dataset.peek = 'file does not exist' |
136 dataset.blurb = 'file purged from disk' | 187 dataset.blurb = 'file purged from disk' |
137 | 188 |
138 | |
139 def sniff( self, filename ): | 189 def sniff( self, filename ): |
140 mgf_begin_ions = "BEGIN IONS" | 190 mgf_begin_ions = "BEGIN IONS" |
141 max_lines=100 | 191 max_lines=100 |
142 | 192 |
143 for i, line in enumerate( file( filename ) ): | 193 for i, line in enumerate( file( filename ) ): |
144 line = line.rstrip( '\n\r' ) | 194 line = line.rstrip( '\n\r' ) |
145 if line==mgf_begin_ions: | 195 if line==mgf_begin_ions: |
146 return True | 196 return True |
147 if i>max_lines: | 197 if i>max_lines: |
148 return False | 198 return False |
149 | 199 |
150 | 200 |
151 class MascotDat( Text ): | 201 class MascotDat( Text ): |
152 """Mascot search results """ | 202 """Mascot search results """ |
153 file_ext = "mascotdat" | 203 file_ext = "mascotdat" |
154 | 204 |
155 def set_peek( self, dataset, is_multi_byte=False ): | 205 def set_peek( self, dataset, is_multi_byte=False ): |
204 except: | 254 except: |
205 return "Thermo Finnigan RAW file (%s)" % ( data.nice_size( dataset.get_size() ) ) | 255 return "Thermo Finnigan RAW file (%s)" % ( data.nice_size( dataset.get_size() ) ) |
206 | 256 |
207 | 257 |
208 if hasattr(Binary, 'register_sniffable_binary_format'): | 258 if hasattr(Binary, 'register_sniffable_binary_format'): |
209 Binary.register_sniffable_binary_format('RAW', 'RAW', RAW) | 259 Binary.register_sniffable_binary_format('raw', 'raw', RAW) |
210 | 260 |
211 | 261 |
212 class Msp(Text): | 262 class Msp( Text ): |
213 """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """ | 263 """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """ |
214 file_ext = "msp" | 264 file_ext = "msp" |
215 | 265 |
216 @staticmethod | 266 @staticmethod |
217 def next_line_starts_with(contents, prefix): | 267 def next_line_starts_with(contents, prefix): |
265 return False | 315 return False |
266 | 316 |
267 return True | 317 return True |
268 | 318 |
269 # unsniffable binary format, should do something about this | 319 # unsniffable binary format, should do something about this |
270 class XHunterAslFormat(Binary): | 320 class XHunterAslFormat( Binary ): |
271 """ Annotated Spectra in the HLF format http://www.thegpm.org/HUNTER/format_2006_09_15.html """ | 321 """ Annotated Spectra in the HLF format http://www.thegpm.org/HUNTER/format_2006_09_15.html """ |
272 file_ext = "hlf" | 322 file_ext = "hlf" |
273 | 323 |
274 | |
275 if hasattr(Binary, 'register_unsniffable_binary_ext'): | 324 if hasattr(Binary, 'register_unsniffable_binary_ext'): |
276 Binary.register_unsniffable_binary_ext('hlf') | 325 Binary.register_unsniffable_binary_ext('hlf') |