comparison blast.py @ 14:623a3fbe5340 draft

planemo upload for repository https://github.com/peterjc/galaxy_blast/tree/master/datatypes/blast_datatypes/ commit 3f6b5c953d522a724bbcd403bcb86f1e2757a556-dirty
author peterjc
date Fri, 03 Feb 2017 12:34:03 -0500
parents da92fef90117
children 310ec0f47485
comparison
equal deleted inserted replaced
13:e96b515847dd 14:623a3fbe5340
1 """NCBI BLAST datatypes.
2
3 Covers the ``blastxml`` format and the BLAST databases.
1 """ 4 """
2 BlastXml class 5
3 """ 6 import logging
7 import os
8 from time import sleep
4 9
5 from galaxy.datatypes.data import get_file_peek 10 from galaxy.datatypes.data import get_file_peek
6 from galaxy.datatypes.data import Text, Data, GenericAsn1 11 from galaxy.datatypes.data import Data, Text
7 from galaxy.datatypes.xml import GenericXml 12 from galaxy.datatypes.xml import GenericXml
8 from galaxy.datatypes.metadata import MetadataElement
9
10 from time import sleep
11 import os
12 import logging
13 13
14 log = logging.getLogger(__name__) 14 log = logging.getLogger(__name__)
15 15
16 class BlastXml( GenericXml ): 16
17 class BlastXml(GenericXml):
17 """NCBI Blast XML Output data""" 18 """NCBI Blast XML Output data"""
18 file_ext = "blastxml" 19 file_ext = "blastxml"
19 20
20 def set_peek( self, dataset, is_multi_byte=False ): 21 def set_peek(self, dataset, is_multi_byte=False):
21 """Set the peek and blurb text""" 22 """Set the peek and blurb text"""
22 if not dataset.dataset.purged: 23 if not dataset.dataset.purged:
23 dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte ) 24 dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
24 dataset.blurb = 'NCBI Blast XML data' 25 dataset.blurb = 'NCBI Blast XML data'
25 else: 26 else:
26 dataset.peek = 'file does not exist' 27 dataset.peek = 'file does not exist'
27 dataset.blurb = 'file purged from disk' 28 dataset.blurb = 'file purged from disk'
28 29
29 def sniff( self, filename ): 30 def sniff(self, filename):
30 """ 31 """Determines whether the file is blastxml
31 Determines whether the file is blastxml 32
32 33 >>> from galaxy.datatypes.sniff import get_test_fname
33 >>> fname = get_test_fname( 'megablast_xml_parser_test1.blastxml' ) 34 >>> fname = get_test_fname('megablast_xml_parser_test1.blastxml')
34 >>> BlastXml().sniff( fname ) 35 >>> BlastXml().sniff(fname)
35 True 36 True
36 >>> fname = get_test_fname( 'tblastn_four_human_vs_rhodopsin.xml' ) 37 >>> fname = get_test_fname('tblastn_four_human_vs_rhodopsin.xml')
37 >>> BlastXml().sniff( fname ) 38 >>> BlastXml().sniff(fname)
38 True 39 True
39 >>> fname = get_test_fname( 'interval.interval' ) 40 >>> fname = get_test_fname('interval.interval')
40 >>> BlastXml().sniff( fname ) 41 >>> BlastXml().sniff(fname)
41 False 42 False
42 """ 43 """
43 #TODO - Use a context manager on Python 2.5+ to close handle 44 # TODO - Use a context manager on Python 2.5+ to close handle
44 handle = open(filename) 45 handle = open(filename)
45 line = handle.readline() 46 line = handle.readline()
46 if line.strip() != '<?xml version="1.0"?>': 47 if line.strip() != '<?xml version="1.0"?>':
47 handle.close() 48 handle.close()
48 return False 49 return False
55 if line.strip() != '<BlastOutput>': 56 if line.strip() != '<BlastOutput>':
56 handle.close() 57 handle.close()
57 return False 58 return False
58 handle.close() 59 handle.close()
59 return True 60 return True
60 61
61 def merge(split_files, output_file): 62 def merge(split_files, output_file):
62 """Merging multiple XML files is non-trivial and must be done in subclasses.""" 63 """Merging multiple XML files is non-trivial and must be done in subclasses."""
63 if len(split_files) == 1: 64 if len(split_files) == 1:
64 #For one file only, use base class method (move/copy) 65 # For one file only, use base class method (move/copy)
65 return Text.merge(split_files, output_file) 66 return Text.merge(split_files, output_file)
66 if not split_files: 67 if not split_files:
67 raise ValueError("Given no BLAST XML files, %r, to merge into %s" \ 68 raise ValueError("Given no BLAST XML files, %r, to merge into %s"
68 % (split_files, output_file)) 69 % (split_files, output_file))
69 out = open(output_file, "w") 70 out = open(output_file, "w")
70 h = None 71 h = None
71 for f in split_files: 72 for f in split_files:
72 if not os.path.isfile(f): 73 if not os.path.isfile(f):
78 h = open(f) 79 h = open(f)
79 header = h.readline() 80 header = h.readline()
80 if not header: 81 if not header:
81 out.close() 82 out.close()
82 h.close() 83 h.close()
83 #Retry, could be transient error with networked file system... 84 # Retry, could be transient error with networked file system...
84 log.warning("BLAST XML file %s empty, retry in 1s..." % f) 85 log.warning("BLAST XML file %s empty, retry in 1s..." % f)
85 sleep(1) 86 sleep(1)
86 h = open(f) 87 h = open(f)
87 header = h.readline() 88 header = h.readline()
88 if not header: 89 if not header:
89 log.error("BLAST XML file %s was empty" % f) 90 log.error("BLAST XML file %s was empty" % f)
90 raise ValueError("BLAST XML file %s was empty" % f) 91 raise ValueError("BLAST XML file %s was empty" % f)
91 if header.strip() != '<?xml version="1.0"?>': 92 if header.strip() != '<?xml version="1.0"?>':
92 out.write(header) #for diagnosis 93 out.write(header) # for diagnosis
93 out.close() 94 out.close()
94 h.close() 95 h.close()
95 raise ValueError("%s is not an XML file!" % f) 96 raise ValueError("%s is not an XML file!" % f)
96 line = h.readline() 97 line = h.readline()
97 header += line 98 header += line
98 if line.strip() not in ['<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">', 99 if line.strip() not in ['<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">',
99 '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">']: 100 '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">']:
100 out.write(header) #for diagnosis 101 out.write(header) # for diagnosis
101 out.close() 102 out.close()
102 h.close() 103 h.close()
103 raise ValueError("%s is not a BLAST XML file!" % f) 104 raise ValueError("%s is not a BLAST XML file!" % f)
104 while True: 105 while True:
105 line = h.readline() 106 line = h.readline()
106 if not line: 107 if not line:
107 out.write(header) #for diagnosis 108 out.write(header) # for diagnosis
108 out.close() 109 out.close()
109 h.close() 110 h.close()
110 raise ValueError("BLAST XML file %s ended prematurely" % f) 111 raise ValueError("BLAST XML file %s ended prematurely" % f)
111 header += line 112 header += line
112 if "<Iteration>" in line: 113 if "<Iteration>" in line:
113 break 114 break
114 if len(header) > 10000: 115 if len(header) > 10000:
115 #Something has gone wrong, don't load too much into memory! 116 # Something has gone wrong, don't load too much into memory!
116 #Write what we have to the merged file for diagnostics 117 # Write what we have to the merged file for diagnostics
117 out.write(header) 118 out.write(header)
118 out.close() 119 out.close()
119 h.close() 120 h.close()
120 raise ValueError("BLAST XML file %s has too long a header!" % f) 121 raise ValueError("The header in BLAST XML file %s is too long" % f)
121 if "<BlastOutput>" not in header: 122 if "<BlastOutput>" not in header:
122 out.close() 123 out.close()
123 h.close() 124 h.close()
124 raise ValueError("%s is not a BLAST XML file:\n%s\n..." % (f, header)) 125 raise ValueError("%s is not a BLAST XML file:\n%s\n..." % (f, header))
125 if f == split_files[0]: 126 if f == split_files[0]:
126 out.write(header) 127 out.write(header)
127 old_header = header 128 old_header = header
128 elif old_header[:300] != header[:300]: 129 elif old_header[:300] != header[:300]:
129 #Enough to check <BlastOutput_program> and <BlastOutput_version> match 130 # Enough to check <BlastOutput_program> and <BlastOutput_version> match
130 out.close() 131 out.close()
131 h.close() 132 h.close()
132 raise ValueError("BLAST XML headers don't match for %s and %s - have:\n%s\n...\n\nAnd:\n%s\n...\n" \ 133 raise ValueError("BLAST XML headers don't match for %s and %s - have:\n%s\n...\n\nAnd:\n%s\n...\n"
133 % (split_files[0], f, old_header[:300], header[:300])) 134 % (split_files[0], f, old_header[:300], header[:300]))
134 else: 135 else:
135 out.write(" <Iteration>\n") 136 out.write(" <Iteration>\n")
136 for line in h: 137 for line in h:
137 if "</BlastOutput_iterations>" in line: 138 if "</BlastOutput_iterations>" in line:
138 break 139 break
139 #TODO - Increment <Iteration_iter-num> and if required automatic query names 140 # TODO - Increment <Iteration_iter-num> and if required automatic query names
140 #like <Iteration_query-ID>Query_3</Iteration_query-ID> to be increasing? 141 # like <Iteration_query-ID>Query_3</Iteration_query-ID> to be increasing?
141 out.write(line) 142 out.write(line)
142 h.close() 143 h.close()
143 out.write(" </BlastOutput_iterations>\n") 144 out.write(" </BlastOutput_iterations>\n")
144 out.write("</BlastOutput>\n") 145 out.write("</BlastOutput>\n")
145 out.close() 146 out.close()
147 148
148 149
149 class _BlastDb(object): 150 class _BlastDb(object):
150 """Base class for BLAST database datatype.""" 151 """Base class for BLAST database datatype."""
151 152
152 def set_peek( self, dataset, is_multi_byte=False ): 153 def set_peek(self, dataset, is_multi_byte=False):
153 """Set the peek and blurb text.""" 154 """Set the peek and blurb text."""
154 if not dataset.dataset.purged: 155 if not dataset.dataset.purged:
155 dataset.peek = "BLAST database (multiple files)" 156 dataset.peek = "BLAST database (multiple files)"
156 dataset.blurb = "BLAST database (multiple files)" 157 dataset.blurb = "BLAST database (multiple files)"
157 else: 158 else:
158 dataset.peek = 'file does not exist' 159 dataset.peek = 'file does not exist'
159 dataset.blurb = 'file purged from disk' 160 dataset.blurb = 'file purged from disk'
160 161
161 def display_peek( self, dataset ): 162 def display_peek(self, dataset):
162 """Create HTML content, used for displaying peek.""" 163 """Create HTML content, used for displaying peek."""
163 try: 164 try:
164 return dataset.peek 165 return dataset.peek
165 except: 166 except Exception:
166 return "BLAST database (multiple files)" 167 return "BLAST database (multiple files)"
167 168
168 def display_data(self, trans, data, preview=False, filename=None, 169 def display_data(self, trans, data, preview=False, filename=None,
169 to_ext=None, size=None, offset=None, **kwd): 170 to_ext=None, size=None, offset=None, **kwd):
170 """Documented as an old display method, but still gets called via tests etc 171 """Documented as an old display method, but still gets called via tests etc
171 172
172 This allows us to format the data shown in the central pane via the "eye" icon. 173 This allows us to format the data shown in the central pane via the "eye" icon.
173 """ 174 """
174 if filename is not None and filename != "index": 175 if filename is not None and filename != "index":
175 #Change nothing - important for the unit tests to access child files: 176 # Change nothing - important for the unit tests to access child files:
176 return Data.display_data(self, trans, data, preview, filename, 177 return Data.display_data(self, trans, data, preview, filename,
177 to_ext, size, offset, **kwd) 178 to_ext, size, offset, **kwd)
178 if self.file_ext == "blastdbn": 179 if self.file_ext == "blastdbn":
179 title = "This is a nucleotide BLAST database" 180 title = "This is a nucleotide BLAST database"
180 elif self.file_ext =="blastdbp": 181 elif self.file_ext == "blastdbp":
181 title = "This is a protein BLAST database" 182 title = "This is a protein BLAST database"
182 elif self.file_ext =="blastdbd": 183 elif self.file_ext == "blastdbd":
183 title = "This is a domain BLAST database" 184 title = "This is a domain BLAST database"
184 else: 185 else:
185 #Error? 186 # Error?
186 title = "This is a BLAST database." 187 title = "This is a BLAST database."
187 msg = "" 188 msg = ""
188 try: 189 try:
189 #Try to use any text recorded in the dummy index file: 190 # Try to use any text recorded in the dummy index file:
190 handle = open(data.file_name, "rU") 191 handle = open(data.file_name, "rU")
191 msg = handle.read().strip() 192 msg = handle.read().strip()
192 handle.close() 193 handle.close()
193 except Exception, err: 194 except Exception:
194 #msg = str(err)
195 pass 195 pass
196 if not msg: 196 if not msg:
197 msg = title 197 msg = title
198 #Galaxy assumes HTML for the display of composite datatypes, 198 # Galaxy assumes HTML for the display of composite datatypes,
199 return "<html><head><title>%s</title></head><body><pre>%s</pre></body></html>" % (title, msg) 199 return "<html><head><title>%s</title></head><body><pre>%s</pre></body></html>" % (title, msg)
200 200
201 def merge(split_files, output_file): 201 def merge(split_files, output_file):
202 """Merge BLAST databases (not implemented for now).""" 202 """Merge BLAST databases (not implemented for now)."""
203 raise NotImplementedError("Merging BLAST databases is non-trivial (do this via makeblastdb?)") 203 raise NotImplementedError("Merging BLAST databases is non-trivial (do this via makeblastdb?)")
204 204
205 def split( cls, input_datasets, subdir_generator_function, split_params): 205 def split(cls, input_datasets, subdir_generator_function, split_params):
206 """Split a BLAST database (not implemented for now).""" 206 """Split a BLAST database (not implemented for now)."""
207 if split_params is None: 207 if split_params is None:
208 return None 208 return None
209 raise NotImplementedError("Can't split BLAST databases") 209 raise NotImplementedError("Can't split BLAST databases")
210 210
211 211
212 class BlastNucDb( _BlastDb, Data ): 212 class BlastNucDb(_BlastDb, Data):
213 """Class for nucleotide BLAST database files.""" 213 """Class for nucleotide BLAST database files."""
214 file_ext = 'blastdbn' 214 file_ext = 'blastdbn'
215 allow_datatype_change = False 215 allow_datatype_change = False
216 composite_type = 'basic' 216 composite_type = 'basic'
217 217
218 def __init__(self, **kwd): 218 def __init__(self, **kwd):
219 Data.__init__(self, **kwd) 219 Data.__init__(self, **kwd)
220 self.add_composite_file('blastdb.nhr', is_binary=True) # sequence headers 220 self.add_composite_file('blastdb.nhr', is_binary=True) # sequence headers
221 self.add_composite_file('blastdb.nin', is_binary=True) # index file 221 self.add_composite_file('blastdb.nin', is_binary=True) # index file
222 self.add_composite_file('blastdb.nsq', is_binary=True) # nucleotide sequences 222 self.add_composite_file('blastdb.nsq', is_binary=True) # nucleotide sequences
223 self.add_composite_file('blastdb.nal', is_binary=False, optional=True) # alias ( -gi_mask option of makeblastdb) 223 self.add_composite_file('blastdb.nal', is_binary=False, optional=True) # alias ( -gi_mask option of makeblastdb)
224 self.add_composite_file('blastdb.nhd', is_binary=True, optional=True) # sorted sequence hash values ( -hash_index option of makeblastdb) 224 self.add_composite_file('blastdb.nhd', is_binary=True, optional=True) # sorted sequence hash values ( -hash_index option of makeblastdb)
225 self.add_composite_file('blastdb.nhi', is_binary=True, optional=True) # index of sequence hash values ( -hash_index option of makeblastdb) 225 self.add_composite_file('blastdb.nhi', is_binary=True, optional=True) # index of sequence hash values ( -hash_index option of makeblastdb)
226 self.add_composite_file('blastdb.nnd', is_binary=True, optional=True) # sorted GI values ( -parse_seqids option of makeblastdb and gi present in the description lines) 226 self.add_composite_file('blastdb.nnd', is_binary=True, optional=True) # sorted GI values ( -parse_seqids option of makeblastdb and gi present in the description lines)
227 self.add_composite_file('blastdb.nni', is_binary=True, optional=True) # index of GI values ( -parse_seqids option of makeblastdb and gi present in the description lines) 227 self.add_composite_file('blastdb.nni', is_binary=True, optional=True) # index of GI values ( -parse_seqids option of makeblastdb and gi present in the description lines)
228 self.add_composite_file('blastdb.nog', is_binary=True, optional=True) # OID->GI lookup file ( -hash_index or -parse_seqids option of makeblastdb) 228 self.add_composite_file('blastdb.nog', is_binary=True, optional=True) # OID->GI lookup file ( -hash_index or -parse_seqids option of makeblastdb)
229 self.add_composite_file('blastdb.nsd', is_binary=True, optional=True) # sorted sequence accession values ( -hash_index or -parse_seqids option of makeblastdb) 229 self.add_composite_file('blastdb.nsd', is_binary=True, optional=True) # sorted sequence accession values ( -hash_index or -parse_seqids option of makeblastdb)
230 self.add_composite_file('blastdb.nsi', is_binary=True, optional=True) # index of sequence accession values ( -hash_index or -parse_seqids option of makeblastdb) 230 self.add_composite_file('blastdb.nsi', is_binary=True, optional=True) # index of sequence accession values ( -hash_index or -parse_seqids option of makeblastdb)
231 # self.add_composite_file('blastdb.00.idx', is_binary=True, optional=True) # first volume of the MegaBLAST index generated by makembindex 231 # self.add_composite_file('blastdb.00.idx', is_binary=True, optional=True) # first volume of the MegaBLAST index generated by makembindex
232 # The previous line should be repeated for each index volume, with filename extensions like '.01.idx', '.02.idx', etc. 232 # The previous line should be repeated for each index volume, with filename extensions like '.01.idx', '.02.idx', etc.
233 self.add_composite_file('blastdb.shd', is_binary=True, optional=True) # MegaBLAST index superheader (-old_style_index false option of makembindex) 233 self.add_composite_file('blastdb.shd', is_binary=True, optional=True) # MegaBLAST index superheader (-old_style_index false option of makembindex)
234 # self.add_composite_file('blastdb.naa', is_binary=True, optional=True) # index of a WriteDB column for e.g. mask data 234 # self.add_composite_file('blastdb.naa', is_binary=True, optional=True) # index of a WriteDB column for e.g. mask data
235 # self.add_composite_file('blastdb.nab', is_binary=True, optional=True) # data of a WriteDB column 235 # self.add_composite_file('blastdb.nab', is_binary=True, optional=True) # data of a WriteDB column
236 # self.add_composite_file('blastdb.nac', is_binary=True, optional=True) # multiple byte order for a WriteDB column 236 # self.add_composite_file('blastdb.nac', is_binary=True, optional=True) # multiple byte order for a WriteDB column
237 # The previous 3 lines should be repeated for each WriteDB column, with filename extensions like ('.nba', '.nbb', '.nbc'), ('.nca', '.ncb', '.ncc'), etc. 237 # The previous 3 lines should be repeated for each WriteDB column, with filename extensions like ('.nba', '.nbb', '.nbc'), ('.nca', '.ncb', '.ncc'), etc.
238 238
239 239
240 class BlastProtDb( _BlastDb, Data ): 240 class BlastProtDb(_BlastDb, Data):
241 """Class for protein BLAST database files.""" 241 """Class for protein BLAST database files."""
242 file_ext = 'blastdbp' 242 file_ext = 'blastdbp'
243 allow_datatype_change = False 243 allow_datatype_change = False
244 composite_type = 'basic' 244 composite_type = 'basic'
245 245
246 def __init__(self, **kwd): 246 def __init__(self, **kwd):
247 Data.__init__(self, **kwd) 247 Data.__init__(self, **kwd)
248 # Component file comments are as in BlastNucDb except where noted 248 # Component file comments are as in BlastNucDb except where noted
249 self.add_composite_file('blastdb.phr', is_binary=True) 249 self.add_composite_file('blastdb.phr', is_binary=True)
250 self.add_composite_file('blastdb.pin', is_binary=True) 250 self.add_composite_file('blastdb.pin', is_binary=True)
251 self.add_composite_file('blastdb.psq', is_binary=True) # protein sequences 251 self.add_composite_file('blastdb.psq', is_binary=True) # protein sequences
252 self.add_composite_file('blastdb.phd', is_binary=True, optional=True) 252 self.add_composite_file('blastdb.phd', is_binary=True, optional=True)
253 self.add_composite_file('blastdb.phi', is_binary=True, optional=True) 253 self.add_composite_file('blastdb.phi', is_binary=True, optional=True)
254 self.add_composite_file('blastdb.pnd', is_binary=True, optional=True) 254 self.add_composite_file('blastdb.pnd', is_binary=True, optional=True)
255 self.add_composite_file('blastdb.pni', is_binary=True, optional=True) 255 self.add_composite_file('blastdb.pni', is_binary=True, optional=True)
256 self.add_composite_file('blastdb.pog', is_binary=True, optional=True) 256 self.add_composite_file('blastdb.pog', is_binary=True, optional=True)
260 # self.add_composite_file('blastdb.pab', is_binary=True, optional=True) 260 # self.add_composite_file('blastdb.pab', is_binary=True, optional=True)
261 # self.add_composite_file('blastdb.pac', is_binary=True, optional=True) 261 # self.add_composite_file('blastdb.pac', is_binary=True, optional=True)
262 # The last 3 lines should be repeated for each WriteDB column, with filename extensions like ('.pba', '.pbb', '.pbc'), ('.pca', '.pcb', '.pcc'), etc. 262 # The last 3 lines should be repeated for each WriteDB column, with filename extensions like ('.pba', '.pbb', '.pbc'), ('.pca', '.pcb', '.pcc'), etc.
263 263
264 264
265 class BlastDomainDb( _BlastDb, Data ): 265 class BlastDomainDb(_BlastDb, Data):
266 """Class for domain BLAST database files.""" 266 """Class for domain BLAST database files."""
267 file_ext = 'blastdbd' 267 file_ext = 'blastdbd'
268 allow_datatype_change = False 268 allow_datatype_change = False
269 composite_type = 'basic' 269 composite_type = 'basic'
270 270