Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Tools/Blast/HTML.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 #------------------------------------------------------------------------------- | |
2 # PACKAGE : Bio::Tools::Blast::HTML | |
3 # PURPOSE : To encapsulate code for HTML formatting BLAST reports. | |
4 # AUTHOR : Steve Chervitz (sac@bioperl.org) | |
5 # CREATED : 28 Apr 1998 | |
6 # STATUS : Alpha | |
7 # REVISION: $Id: HTML.pm,v 1.15 2002/11/04 09:12:51 heikki Exp $ | |
8 # | |
9 # For the latest version and documentation, visit the distribution site: | |
10 # http://bio.perl.org/Projects/Blast/ | |
11 # | |
12 # To generate documentation, run this module through pod2html | |
13 # (preferably from Perl v5.004 or better). | |
14 # | |
15 # CUSTOMIZATION NOTE: | |
16 # | |
17 # If your Blast reports are not getting marked up correctly, add or | |
18 # modify the regexps in _markup_report() to accommodate the format of | |
19 # your reports. | |
20 # | |
21 # Copyright (c) 1996-98 Steve Chervitz. All Rights Reserved. | |
22 # This module is free software; you can redistribute it and/or | |
23 # modify it under the same terms as Perl itself. | |
24 #------------------------------------------------------------------------------- | |
25 | |
26 package Bio::Tools::Blast::HTML; | |
27 use strict; | |
28 use Exporter; | |
29 | |
30 use Bio::Tools::WWW qw(:obj); | |
31 | |
# Package globals. %DbUrl and %SGDUrl (URL tables) and the regexp fragments
# $Acc, $Pir_acc, $Word, $Signif, $Int and $Descrip are filled in by
# _set_markup_data().
32 use vars qw( @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS | |
33 $ID %DbUrl %SGDUrl $Revision | |
34 $Acc $Pir_acc $Word $Signif $Int $Descrip); | |
35 | |
# Exporter setup: nothing is exported by default; get_html_func() and
# strip_html() are available on request, individually or via the 'std' tag.
36 @ISA = qw(Exporter); | |
37 @EXPORT = qw(); | |
38 @EXPORT_OK = qw(&get_html_func &strip_html); | |
39 %EXPORT_TAGS = ( std => [qw(&get_html_func &strip_html)] ); | |
40 | |
# Module identity strings.
41 $ID = 'Bio::Tools::Blast::HTML'; | |
42 $Revision = '$Id: HTML.pm,v 1.15 2002/11/04 09:12:51 heikki Exp $'; #' | |
43 | |
# File-private state:
#   $_set_markup - true once _set_markup_data() has run.
#   $_gi_link    - base URL for GenBank links; set by the closure returned from
#                  get_html_func() and interpolated by _markup_report().
44 my $_set_markup = 0; | |
45 my $_gi_link = ''; | |
46 | |
47 | |
48 ## POD Documentation: | |
49 | |
50 =head1 NAME | |
51 | |
52 Bio::Tools::Blast::HTML - Bioperl Utility module for HTML formatting Blast reports | |
53 | |
54 =head1 SYNOPSIS | |
55 | |
56 =head2 Adding HTML-formatting | |
57 | |
58 use Bio::Tools::Blast::HTML qw(&get_html_func); | |
59 | |
60 $func = &get_html_func(); | |
61 | |
62 # Now as each line of the report is read, pass it to &$func($line). | |
63 | |
64 See L<get_html_func()|get_html_func> for details. | |
65 Also see B<Bio::Tools::Blast::to_html> for an example of usage. | |
66 | |
67 | |
68 =head2 Removing HTML-formatting | |
69 | |
70 use Bio::Tools::Blast::HTML qw(&strip_html); | |
71 | |
72 &strip_html(\$blast_report_string) | |
73 | |
74 See L<strip_html()|strip_html> for details. | |
75 | |
76 | |
77 =head1 INSTALLATION | |
78 | |
79 This module is included with the central Bioperl distribution: | |
80 | |
81 http://bio.perl.org/Core/Latest | |
82 ftp://bio.perl.org/pub/DIST | |
83 | |
84 Follow the installation instructions included in the README file. | |
85 | |
86 =head1 DESCRIPTION | |
87 | |
88 This module can be used to add HTML formatting to or remove HTML | |
89 formatting from a raw Blast sequence analysis report. Hypertext links | |
90 to the appropriate database are added for each hit sequence (GenBank, | |
91 Swiss-Prot, PIR, PDB, SGD). | |
92 | |
93 This module is intended for use by Bio::Tools::Blast.pm and related modules, | |
94 which provide a front-end to the methods in Bio::Tools::Blast::HTML.pm. | |
95 | |
96 =head1 DEPENDENCIES | |
97 | |
98 Bio::Tools::Blast::HTML.pm does not inherit from any other class | |
99 besides Exporter. It is used by B<Bio::Tools::Blast.pm> only. This | |
100 class relies on B<Bio::Tools::WWW.pm> to provide key URLS for adding | |
101 links in the Blast report to specific databases. | |
102 | |
103 The greatest dependency comes from the dynamic state of the web. URLs | |
104 are likely to change in the future, so all links cannot be | |
105 guaranteed to work indefinitely. Feel free to report broken or | |
106 incorrect database links (L<FEEDBACK | FEEDBACK>). Thanks! | |
107 | |
108 =head1 SEE ALSO | |
109 | |
110 Bio::Tools::Blast.pm - Blast object. | |
111 Bio::Tools::WWW.pm - URL repository. | |
112 | |
113 http://bio.perl.org/Projects/modules.html - Online module documentation | |
114 http://bio.perl.org/Projects/Blast/ - Bioperl Blast Project | |
115 http://bio.perl.org/ - Bioperl Project Homepage | |
116 | |
117 =head1 FEEDBACK | |
118 | |
119 =head2 Mailing Lists | |
120 | |
121 User feedback is an integral part of the evolution of this and other | |
122 Bioperl modules. Send your comments and suggestions preferably to one | |
123 of the Bioperl mailing lists. Your participation is much appreciated. | |
124 | |
125 bioperl-l@bioperl.org - General discussion | |
126 http://bio.perl.org/MailList.html - About the mailing lists | |
127 | |
128 =head2 Reporting Bugs | |
129 | |
130 Report bugs to the Bioperl bug tracking system to help us keep | |
131 track of the bugs and their resolution. Bug reports can be submitted | |
132 via email or the web: | |
133 | |
134 bioperl-bugs@bio.perl.org | |
135 http://bugzilla.bioperl.org/ | |
136 | |
137 =head1 AUTHOR | |
138 | |
139 Steve Chervitz, E<lt>sac@bioperl.orgE<gt> | |
140 | |
141 =head1 COPYRIGHT | |
142 | |
143 Copyright (c) 1998-2000 Steve Chervitz. All Rights Reserved. | |
144 This module is free software; you can redistribute it and/or | |
145 modify it under the same terms as Perl itself. | |
146 | |
147 | |
148 =cut | |
149 | |
150 | |
151 # | |
152 ## | |
153 ### | |
154 #### END of main POD documentation. | |
155 ### | |
156 ## | |
157 #' | |
158 | |
159 | |
160 ###################### BEGIN FUNCTIONS ######################## | |
161 | |
162 =head1 APPENDIX | |
163 | |
164 Methods beginning with a leading underscore are considered private | |
165 and are intended for internal use by this module. They are | |
166 B<not> considered part of the public interface and are described here | |
167 for documentation purposes only. | |
168 | |
169 | |
170 | |
171 =head2 get_html_func | |
172 | |
173 Usage : $func_ref = &get_html_func( [array_ref] ); | |
174 : This method is exported. | |
175 Purpose : Provides a function that adds HTML formatting to a | |
176 : raw Blast report line-by-line. | |
177 : Utility method used by to_html() in Bio::Tools::Blast.pm. | |
178 Returns : Reference to an anonymous function to be used while reading in | |
179 : the raw report. | |
180 : The function itself operates on the Blast report line-by-line | |
181 : HTML-ifying it and printing it to STDOUT (or saving in the supplied | |
182 : array ref) as it goes: | |
183 : foreach( @raw_report ) { &$func_ref($_); } | |
184 Argument : array ref (optional) for storing the HTML-formatted report. | |
185 : If no argument is supplied, HTML output is sent to STDOUT. | |
186 Throws : Croaks if an argument is supplied and is not an array ref. | |
187 : The anonymous function returned by this method croaks if | |
188 : the Blast output appears to be HTML-formatted already. | |
189 Comments : Adapted from a script by Keith Robison November 1993 | |
190 : krobison@nucleus.harvard.edu; http://golgi.harvard.edu/gilbert.html | |
191 : Modified extensively by Steve Chervitz and Mike Cherry. | |
192 : Some modifications are customizations for BLAST reports served up | |
193 : by the Saccharomyces Genome Database. | |
194 : Feel free to modify or replace portions of this code as necessary | |
195 : to accommodate new BLAST datasets or changes to the Blast format. | |
196 | |
197 See Also : B<Bio::Tools::Blast::to_html()> | |
198 | |
199 =cut | |
200 | |
201 #-------------------- | |
sub get_html_func {
    # Purpose  : Build a closure that adds HTML formatting to a raw Blast
    #            report, one line per call.
    # Argument : $out_aref - optional ARRAY ref that collects the HTML pieces.
    #            Without it, every piece is printed to the selected handle.
    # Returns  : CODE ref taking a single report line; the closure returns 1.
    # Throws   : Croaks if an argument is supplied that is not an ARRAY ref.
    #            The closure croaks if a line contains <HTML>, <TITLE> or <PRE>.
    my ($out_aref) = @_;

    # croak() was never imported by this module, so load Carp here and call
    # it by its full name.
    require Carp;

    if ( defined($out_aref) and ref($out_aref) ne 'ARRAY' ) {
        Carp::croak("Argument must be an ARRAY ref not a ${\ref $out_aref}.");
    }

    ## Key booleans used in parsing (state shared across calls of the closure).
    my $found_table     = 0;   # Located the table at top of report ('descriptions').
    my $found_data      = 0;   # Nothing is passed through until this is true.
    my $skip            = 0;   # Header line was already emitted in HTML form.
    my $ref_skip        = 0;   # Inside the "Reference:" paragraph (replaced by $refs).
    my $getNote         = 0;   # Inside the text following a WARNING/NOTICE line.
    my $getGenBankAlert = 0;   # Emit _genbank_alert() before the summary table.
    my $prog            = '';  # Program name taken from the report's first line.

    # Single place that decides between collecting and printing.
    my $emit = sub {
        my ($html) = @_;
        if ($out_aref) { push @$out_aref, $html }
        else           { print $html }
        return;
    };

    my $refs = _prog_ref_html();

    _set_markup_data() if not $_set_markup;

    return sub {
        # $_ contains a single line from a Blast report.
        local $_ = shift;

        Carp::croak("Report appears to be HTML formatted already.")
            if m/<HTML>|<TITLE>|<PRE>/i;

        if ($found_table) {
            # Body of the report: summary table and alignment sections.
            _markup_report(\$_);
        }
        else {
            # Still in the report header.
            # The reference paragraph ends at the first blank line.
            $ref_skip = 0 if $ref_skip and /^\s+$/;

            if ($getNote) {
                ## SAC: created this test since we are no longer reading from STDIN.
                $emit->($_);
                $getNote = 0 if m/^\s+$/;
            }
            elsif ( m/(.*), Up \d.*/ or /Date: +(.+)/ or /Start: +(.+?) +End:/ ) {
                ### Network BLAST reports from NCBI are time stamped as follows:
                #Fri Apr 18 15:55:41 EDT 1997, Up 1 day, 19 mins, 1 user, load: 19.54, 19.13, 17.77
                $emit->("<b>BLASTed on:</b> $1<p>\n");
            }
            elsif ( /^(?:<\w+>)?(T?BLAST[NPX])\s+/ ) {
                # Save the program name BEFORE the substitution below resets
                # the capture variables.
                $prog       = $1;
                $found_data = 1;
                $skip       = 1;
                s#(\S+)\s+(.*)#<P><B>Program:</B> $1 $2 <br>#;
                $emit->($_);

                ## Prevent the error at Entrez when you ask for a nucl
                ## entry with a protein GI number.
                $_gi_link = $prog =~ /BLASTN/
                          ? $DbUrl{'gb_n'}    # nucleotide
                          : $DbUrl{'gb_p'};   # protein
            }
            elsif ( m/^Query=/ ) {
                # Keeping the "Query=" format to keep it parsable by Blast.pm
                # (after stripping HTML).
                s#Query= *(.*)#<title>$1</title>\n<p><b>Query=</b> $1#;
                $emit->($_);
                $skip = 1;
            }
            elsif ( /Reference:/ ) {
                $ref_skip = 1;
            }
            elsif ( /^Database:/ ) {
                _markup_database(\$_);
                $emit->($_);
                if ( /non-redundant genbank/i and $prog =~ /TBLAST[NX]/i ) {
                    $getGenBankAlert = 1;
                }
                $skip = 1;
            }
            elsif ( /sequences;/ ) {
                $emit->("$_<p>");
            }
            elsif ( /^\s+\(\d+ letters\)\s+/ ) {
                $emit->("<br>    $_");
            }
            elsif ( /^(WARNING|NOTICE):/i ) {
                # Label in red, message after it; covers both keywords.
                s#^(WARNING|NOTICE): *(.*)#<p><b><font color="red">$1:</font></b> $2#i;
                $emit->($_);
                $getNote = 1;
            }
            elsif ( /Score +E\s*$/ or /Probability\s*$/ ) {
                # Put the last HTML-formatted lines before the main body of report.
                $found_table = 1;
                $skip        = 0;
                $emit->($refs);
                $emit->( _genbank_alert() ) if $getGenBankAlert;
                $emit->("\n<p><pre>");
            }
        }

        # Pass the (possibly marked-up) line through unless a header branch
        # already emitted its own HTML version of it.
        if ( $found_data and not( $skip or $ref_skip ) ) {
            $emit->($_);
        }
        1;
    };    # end sub {}
}
308 | |
309 | |
310 | |
311 | |
312 =head2 _set_markup_data | |
313 | |
314 Usage : n/a; utility method used by get_html_func() | |
315 Purpose : Sets various hashes and regexps used for adding HTML | |
316 : to raw Blast output. | |
317 Returns : n/a | |
318 Comments : These items need be set only once. | |
319 | |
320 See Also : L<get_html_func()|get_html_func> | |
321 | |
322 =cut | |
323 | |
324 #------------------- | |
sub _set_markup_data {
    # Loads the URL tables from the shared $BioWWW object and defines the
    # regexp fragments that _markup_report() interpolates into its patterns.
    # Marks the data as initialised so get_html_func() calls this only once.
    %DbUrl  = $BioWWW->search_url('all');
    %SGDUrl = $BioWWW->sgd_url('all');

    ( $Signif, $Int, $Descrip, $Acc, $Pir_acc, $Word ) = (
        '[\de.-]{3,}',           # a P-value or Expect value
        ' *\d\d*',               # an integer
        ' +.* {2,}?',            # a description line
        '[A-Z][\d.]+',           # GB/EMBL/DDBJ/SP accession number
        '[A-Z][A-Z0-9]{5,}',     # PIR accession number
        '[\w_.]+',               # a word; the dot allows for a version suffix
    );

    $_set_markup = 1;
}
339 | |
340 | |
341 =head2 _markup_database | |
342 | |
343 Usage : n/a; utility method used by get_html_func() | |
344 Purpose : Converts a cryptic database ID into a readable name. | |
345 Returns : n/a | |
346 Comments : This is used for converting local database IDs into | |
347 : understandable terms. At present, it only recognizes | |
348 : databases used locally at SGD. | |
349 | |
350 See Also : L<get_html_func()|get_html_func> | |
351 | |
352 =cut | |
353 | |
354 #--------------------- | |
sub _markup_database {
    # Rewrites, in place, the "Database:" header line referenced by the
    # argument: known local database IDs are replaced by readable names
    # (first occurrence only, in the order listed) and the "Database:" label
    # is set in bold.
    my ($text_ref) = @_;

    my @readable_name_for = (
        [ 'YeastN'     => '<i>S. cerevisiae</i> GenBank Data Set; ' ],
        [ 'YeastP'     => 'Non-Redundant <i>S. cerevisiae</i> Protein Data Set; ' ],
        [ 'genoSC'     => 'Complete DNA Sequence for the S. cerevisiae Genome; ' ],
        [ 'YeastORF-P' => 'Translation of all Standard S.c. ORFs; ' ],
        [ 'YeastORF-N' => 'Coding Sequence of all Standard S.c. ORFs; ' ],
    );

    for my $entry (@readable_name_for) {
        my ( $db_id, $db_name ) = @{$entry};
        $$text_ref =~ s/\Q$db_id\E/$db_name/;
    }

    $$text_ref =~ s{Database: *(.*)}{<p><b>Database:</b> $1};

    return $$text_ref;
}
369 | |
370 | |
371 =head2 _markup_report | |
372 | |
373 Usage : n/a; utility function used by get_html_func() | |
374 Purpose : Adds HTML links to aid navigation of raw Blast output. | |
375 Returns : n/a | |
376 Comments : HTML-formatting is dependent on the Blast server that | |
377 : provided the Blast report. Currently, this function can handle reports | |
378 : produced by NCBI and SGD. Feel free to modify this function | |
379 : to accommodate reports produced by other servers/sites. | |
380 : | |
381 : This function is simply a collection of substitution regexps | |
382 : that recognize and modify the relevant lines of the Blast report. | |
383 : All non-header lines of the report are passed through this function, | |
384 : only the ones that match will get modified. | |
385 : | |
386 : The general scheme for adding links is as follows: | |
387 : (Some of the SGD markups do not follow this scheme precisely | |
388 : but this is the general trend.) | |
389 : | |
390 : For description lines in the summary table at the top of report: | |
391 : | |
392 : DB:SEQUENCE_ID DESCRIPTION SIGNIF_VAL | |
393 : DB = links to the indicated database (if not Gen/Embl/Ddbj). | |
394 : SEQUENCE_ID = links to GenBank entry for the sequence. | |
395 : SIGNIF_VAL = internal link to relevant alignment section. | |
396 : | |
397 : For the alignment sections in the body of the report: | |
398 : | |
399 : DB:SEQUENCE_ID (Back | Top) DESCRIPTION | |
400 : DB = links to the indicated database (if not Gen/Embl/Ddbj). | |
401 : SEQUENCE_ID = links to GenBank entry for the sequence. | |
402 : SIGNIF_VAL = internal link to alignment section. | |
403 : Back = internal link to description line in summary section. | |
404 : Top = internal link to top of page. | |
405 : | |
406 : 'DB' links are created for PDB, PIR, and SwissProt sequences. | |
407 : | |
408 : RE_PARSING HTML-FORMATTED REPORTS: | |
409 : ---------------------------------- | |
410 : HTML-formatted reports generated by this module, as well as reports | |
411 : obtained from the NCBI servers, should be parsable | |
412 : by Bio::Tools::Blast.pm. Parsing HTML-formatted reports is | |
413 : slow, however, since the HTML must be removed prior to parsing. | |
414 : Parsing HTML-formatted reports is dependent on the specific structure | |
415 : of the HTML and is generally not recommended. | |
416 : | |
417 : Note that since URLs can change without notice, links will need updating. | |
418 : The links are obtained from Bio::Tools::WWW.pm; updating that module | |
419 : will update this as well. | |
420 : | |
421 Bugs : Some links to external databases are incorrect | |
422 : (in particular, for 'bbs' and 'prf' databases on NCBI Blast reports). | |
423 : Some links may fail as a result of the dynamic nature of the web. | |
424 : Hypertext links are not added to hits without database ids. | |
425 | |
426 See Also : L<get_html_func()|get_html_func>, B<Bio::Tools::WWW.pm>, L<strip_html>() | |
427 | |
428 =cut | |
429 | |
430 #-------------------- | |
sub _markup_report {
    # Purpose  : Adds hypertext links and navigation anchors to one line from
    #            the body of a Blast report (summary table or alignment header).
    # Argument : $line_ref - SCALAR ref to the line; the line is rewritten in
    #            place.  Lines matching none of the patterns pass unchanged.
    # Notes    : The substitutions run in order.  Once an alignment-section
    #            rule fires, the leading '>' is gone, so the catch-all rules
    #            further down cannot fire on the same line.
    #            Every "Back" link is closed with </a> before the "Top" link
    #            so that anchors never nest.
    my $line_ref = shift;
    local $_ = $$line_ref;

    ##
    ## REGEXPS FOR ALIGNMENT SECTIONS (within the body of the report,
    ## the text above the list of HSPs).
    ##
    ## If the HSP alignment sections don't start with a '>' we have no way
    ## of finding them. This occurs with reports saved from HTML-formatted
    ## web pages, which we shouldn't be processing here anyway.

    ## To facilitate parsing of HTML-formatted reports by Bio::Tools::Blast.pm,
    ## the <a name=...> anchors should be added at the BEGINNING of the HSP
    ## alignment section lines and at the END of the description section lines.

    # Removing " ! " added by GCG.
    s/ ! / /;

    ### NCBI-specific markups for HSP alignment section lines:

    # Optional capture groups are interpolated below even when they did not
    # match, so warnings are switched off for the rest of this sub.
    local($^W) = 0;

    # GenBank/EMBL, DDBJ hits (GenBank Format):
    s@^>(gb|emb|dbj|ref)\|($Word)(\|$Word)?(.*)$@<a name=$2_A></a><b>$1:<a href="$_gi_link$2">$2$3</a></b>$4<br>(<a href="\#$2_H">Back</a>|<a href="\#top">Top</a>)@o;

    s@^>(gb|emb|dbj|ref)\|($Word)(\| \(?$Word\)?)(.*)$@<a name=$2_A></a><b>$1:<a href="$_gi_link$2">$2</a></b>$3$4<br>(<a href="\#$2_H">Back</a>|<a href="\#top">Top</a>)@o;

    # PIR hits
    s@^>pir\|\|($Word)( .*)$@<a name=$1_A></a><b><a href=\"$DbUrl{'pir_acc'}$1\">pir</a>:<a href="$DbUrl{'gb_p'}$1">$1</a></b> $2 <br>(<a href="\#$1_H">Back</a>|<a href="\#top">Top</a>)@o;

    # GI hits (GenBank Format): using a nested (())
    s@^>(gi)\|($Word)( +\(($Word)\))( .*)$@<a name=$4_A></a><b>$1:<a href="$_gi_link$4">$2</a></b>$3$5<br>(<a href="\#$4_H">Back</a>|<a href="\#top">Top</a>)@o;

    # GNL PID hits (GenBank Format):
    s@^>(gnl)\|($Word)?(\|$Word) +\(($Word)\)( .*)$@<a name=$4_A></a><b>$1:<a href="$_gi_link$4">$2$3</a></b>($4)$5<br>(<a href="\#$4_H">Back</a>|<a href="\#top">Top</a>)@o;

    # BBS and PRF hits (what db?) (GenBank Format):
    s@^>(bbs|prf)\|\|?($Word)( .*)$@<a name=$2_A></a><b>$1:<a href="$_gi_link$2">$2</a></b>$3<br>(<a href="\#$2_H">Back</a>|<a href="\#top">Top</a>)@o;

    # SwissProt hits:
    s@^>sp\|($Word)\|($Word)?( .*)$@<a name=$1_A></a><b><a href="$DbUrl{'swpr'}$1">sp</a>:<a href="$DbUrl{'gb_p'}$1">$1|$2</a></b>$3<br>(<a href="\#$1_H">Back</a>|<a href="\#top">Top</a>)@o;


    ## PDB ids with or without a chain identifier (GenBank format)
    s@^>pdb\|(\d\w{3})\|[\w ] (.*)$@<a name=$1_A></A><b><a href=\"$DbUrl{'3db'}$1\">pdb</A>:<a href="$DbUrl{'gb_struct'}$1">$1</a></b> (<a href="\#$1_H">Back</a>|<a href="\#top">Top</a>) $2@o;


    ### SGD-specific markups for HSP alignment section lines:

    ## PDB ids without chain identifier
    s@^>PDB_UNIQUEP:(\d\w{3})_ (.*)$@<a name=$1_A></A><b><A HREF="$DbUrl{'3db'}$1">PDB</a>:<A HREF="$DbUrl{'gb_struct'}$1">$1</A></b> (<a href="\#$1_H">Back</a>|<a href="\#top">Top</a>) $2@o;

    ## PDB ids with chain identifier
    s@^>PDB_UNIQUEP:(\d\w{3})_([\w ]{1})(.*)$@<a name=$1_A></A><b><A HREF="$DbUrl{'3db'}$1">PDB</a>:<A HREF="$DbUrl{'gb_struct'}$1">$1</A></b> Chain:$2, (<a href="\#$1_H">Back</a>|<a href="\#top">Top</a>) $3@o;

    s@^>($Word)PEPT:GI_(\d+)(.*)$@<a name=$2_A></a><b>$1:<a href="$DbUrl{'gb_p'}$2">GI_$2</a></b> $3 <br>(<a href="\#$2_H">Back</a>|<a href="\#top">Top</a>)@o;

    # The gcg blast dataset generating tools up-case all sbjct sequence IDs.
    # This is fine for yeast but not worm. This is considered a hack here.
    s@WORMPEPT:(\w+\.)(\S+)@WORMPEPT:$1\L$2\E@;

    s@^>WORMPEPT:(\S+)(.*)$@<a name=$1_A></a><b>WORMPEP:<A HREF="$DbUrl{'wormace'}$1">$1</a></b> $2 <br>(<a href="\#$1_H">Back</a>|<a href="\#top">Top</a>)@o;

    s#^>(GB_$Word):($Word) ($Acc) (.*$)#<a name=$2_$3_A></A><a href=\#$2_$3_H>$2|$3</A>$4\t<b>[<A HREF=$_gi_link$3>GenBank</A> / <A HREF=$DbUrl{'embl'}$3>EMBL</A> / <A HREF=\"$SGDUrl{'seq_an'}$2\*\">SGD</A>]</b> #o;

    # Sac's version: ORF name is an external link into SGD:
    s@^>ORFP:(\S*) +([\w-]+)(.*$)@<a name=$1_A></A>ORFP:<a href=\"$SGDUrl{'locus'}$2\">$1 $2</A>$3<br>     <b>[<A HREF=\"$SGDUrl{'seq_an'}$2\">Gene/Sequence Resources</a> / <a href=\"$SGDUrl{'map_orf'}$2\">ORF Map</a></b>] <a href="\#$1_H">Back</a>|<a href="\#top">Top</a>@o;

    # Mike's version:
    # s#^>ORFP:(\S*) (.*$)#<a name=$1_A></A><a href=\#$1_H>ORFP:$1</A> $2\t<b>[<A HREF=\"$SGDUrl{'seq_an'}$1\">Gene/Sequence Resources</a> / <a href=\"$SGDUrl{'map_orf'}$1\">ORF Map</a>]</b> #o;

    s#^>ORFN:(\S*) (.*$)#<a name=$1_A></A><a href=\#$1_H>ORFN:$1</A> $2\t<b>[<A HREF=\"$SGDUrl{'seq_an'}$1\">Gene/Sequence Resources</a>] / <a href=\"$SGDUrl{'map_orf'}$1\">ORF Map</a></b> #o;

    s#^>NR_SC:GP-\S* gi\|(\w+)([\w\|]*) (.*$)#<a name=$1_A></A><a href=\#$1_H>GenPept|$1</A> gp|$2 $3\t<b>[<A HREF=$DbUrl{'gb_p'}$1>GenPept</A> / <A HREF=\"$SGDUrl{'gi'}$1\*\">SGD</A>]</b> #o;

    s#^>NR_SC:SW-$Word SW:($Word) ($Acc) (.*$)#<a name=$1_A></A><a href=\#$1_H>SWISS|$1 $2</A> $3\t<b>[<a href=$DbUrl{'swpr'}$2>SwissProt</a> / <A HREF=$DbUrl{'gb_p'}$2>Entrez</A>]</b>#o;

    s#^>NR_SC:PIR-$Word PIR:($Word) (.*$)#<a name=$1_A> </A><a href=\#$1_H>PIR|$1</A> $2\t<b>[<a href=$DbUrl{'pir_uid'}$1>PIR</a> / <A HREF=$DbUrl{'gb_p'}$1>Entrez</A>]</b>#o;

    s#^>CHRS:([A-Z][0-9]*) (.*)$#<a name=$1_A></a><a href=\#$1_H>$1</A> $2: [<b><a href=$SGDUrl{'seq_an'}$1>Gene/Sequence Resources</A> / <a href=\"$SGDUrl{'map_chr'}$1\">ORF Map</a></b>]#o;

    s#^>NOT:([A-Z]_[0-9]*-[0-9]*)( *)Chromosome ([0-9]*) from ([0-9]*) to ([0-9]*)$#<a name=$1_A></a><a href=\#$1_H>$1</A> $2Chromosome $3 from $4 to $5 [<b><a href=$SGDUrl{'chr'}$3\&beg=$4\&end=$5>Gene/Sequence Resources</a> / <a href=\"$SGDUrl{'map_chr'}$3\&beg=$4\&end=$5\">ORF Map</a> / <a href=\"$SGDUrl{'chr_old'}$3\&beg=$4\&end=$5\">Retrieve DNA</a></b>]#o;

    s#^>UTR5_SC_[0-9]*:(\S*) 5' untranslated region, chr(\S*) ([0-9]*) - ([0-9]*)(.*$)#<a name=$1_A></A><a href=\#$1_H>UTR5:$1</A> $1 5' untranslated region, chr$2 $3 - $4, $5\t<b>[<A HREF=\"$SGDUrl{'chr'}$2&beg=$3&end=$4\">Gene/Sequence Resources</A> / <a href=\"$SGDUrl{'map_chr'}$2\&beg=$3\&end=$4\">ORF Map</a>]</b>#o;

    # Hits without a db identifier.
    # If any of the previous regexps succeed, the leading '>' will be removed.
    # Otherwise, this regexp could cause trouble.
    s@^>($Word)(.*)$@<a name=$1_A></a>$1 $2<br>(<a href="\#$1_H">Back</a>|<a href="\#top">Top</a>)@o;

    ##
    ## REGEXPS FOR SUMMARY TABLE LINES AT TOP OF REPORT (a.k.a. 'descriptions')
    ## (table of sequence id, description, score, P/Expect value, n)
    ##
    ## Not using bold face to highlight the sequence id's since this can throw
    ## off formatting of the line when the IDs are different lengths. This led to
    ## the scores and P/Expect values not lining up properly.

    ### NCBI-specific markups for description lines:

    # GenBank/EMBL, DDBJ hits (GenBank Format):
    s@^ ?(gb|emb|dbj|ref)\|($Word)(\|$Word)?($Descrip)($Int +)($Signif)(.*)$@$1:<a href="$_gi_link$2">$2$3</a>$4$5<A href="\#$2_A">$6</a>$7<a name="$2_H"></a>@o;

    s@^ ?(gb|emb|dbj|ref)\|($Word)(\| \(?$Word\)?)($Descrip)($Int +)($Signif)(.*)$@$1:<a href="$_gi_link$2">$2</a>$3$4$5<A href="\#$2_A">$6</a>$7<a name="$2_H"></a>@o;

    # Missing inner ID
    s@^ ?pir\|\|($Word)?($Descrip)($Int) ($Signif)(.*)$@<a href="$DbUrl{'pir_acc'}$1">pir</a>:<a href="$DbUrl{'gb_p'}$1">$1</a> $2$3 <A href="\#$1_A">$4</a>$5<a name="$1_H"></a>@o;

    # GI hits (GenBank Format): using a nested (())
    s@^ ?gi\|($Word)( +\(($Word)\))($Descrip)($Int) ($Signif)(.*)$@gi:<a href="$_gi_link$3">$1</a>$2$4$5 <A href="\#$3_A">$6</a>$7<a name="$3_H"></a>@o;

    s@^ ?(gnl)\|($Word)?(\|$Word +)\(($Word)\)($Descrip)($Int) ($Signif)(.*)$@$1:<a href="$_gi_link$4">$2$3</a>($4)$5$6 <A href="\#$4_A">$7</a>$8<a name="$4_H"></a>@o;


    s@^ ?(bbs|prf)\|\|?($Word)($Descrip)($Int) ($Signif)(.*)$@$1:<a href="$_gi_link$2">$2</a> $3$4 <A href="\#$2_A">$5</a>$6<a name="$2_H"></a>@o;


    ## SwissProt accessions (GenBank format)
    s@^ ?sp\|($Word)(\|$Word)?($Descrip)($Int) ($Signif)(.*)$@<a href="$DbUrl{'swpr'}$1">sp</a>:<a href="$DbUrl{'gb_p'}$1">$1$2</a>$3$4 <a href="\#$1_A">$5</a>$6<a name="$1_H"></a>@o;

    ## PDB ids with or without a chain ID (GenBank format)
    s@^ ?pdb\|($Word)\|($Word)?($Descrip)($Int) ($Signif)(.*)$@<a href="$DbUrl{'3db'}$1">pdb</a>:<a href="$DbUrl{'gb_struct'}$1">$1_$2</a>$3$4 <a href="\#$1_A">$5</a>$6<a name="$1_H"></a>@o;


    ### SGD-specific markups for description lines:

    ## PDB ids without chain identifier
    s@^ ?PDB_UNIQUEP:(\d\w{3})_($Descrip)($Int) ($Signif)(.*)$@<a href="$DbUrl{'3db'}$1">PDB</a>:<A HREF="$DbUrl{'gb_struct'}$1">$1</A> $2$3 <a href="\#$1_A">$4</a>$5<a name="$1_H"></a>@o;


    ## PDB ids with chain identifier
    s@^ ?PDB_UNIQUEP:(\d\w{3})_(\w)($Descrip)($Int) ($Signif)(.*)$@<a href="$DbUrl{'3db'}$1">PDB</a>:<A HREF="$DbUrl{'gb_struct'}$1">$1</A> Chain:$2$3$4 <a href="\#$1_A">$5</a>$6<a name="$1_H"></a>@o;


    s@^ ?($Word)PEPT:GI_(\d+)($Descrip)($Int) ($Signif)(.*)$@$1:<A HREF="$DbUrl{'gb_p'}$2">GI_$2</A> $3 $4 <a href="\#$2_A">$5</a> $6<a name="$2_H"></a>@o;

    s@^ *WORMPEPT:(\S+)($Descrip)($Int) ($Signif)(.*)$@WORMPEP:<A HREF="$DbUrl{'wormace'}$1">$1</a> $2 $3 <a href="\#$1_A">$4</a>$5<a name="$1_H"></a>@o;

    ## Mike Cherry's markups. SAC note: added back database name to allow
    ## the HTML-formatted version to be parsable by Blast.pm.

    s#^ ?(GB_$Word:)($Word)( *)($Acc)($Descrip)($Int) ( *$Signif) ( *\d*)$#GenBank\|<a href="$_gi_link$4">$2</A>\|$4 $3$5$6 <a href="\#$2_$4_A">$7</A> $8<a name="$2_$4_H"></A>#o;

    # Mike's version:
    # s#^ ?(ORFP:)(\S*)($Descrip)($Int) ($Signif) ($Int)$#$1<b>$2</b> $3 $4 <a href="\#$2_A">$5</a> $6<a name="$2_H"></a>#o;

    # My modification:
    s@^ ?ORFP:(\S*) +([\w-]+)(.*[ ]{2,3})($Int) ($Signif) ($Int)$@ORFP:<A HREF=\"$SGDUrl{'locus'}$2\">$1 $2</A>$3$4 <a href="\#$1_A">$5</a> $6<a name="$1_H"></a>@o;

    s#^ ?(ORFN:)(\S*)($Descrip)($Int) ($Signif) ($Int)$#$1$2 $3 $4 <a href="\#$2_A">$5</a> $6<a name="$2_H"></a>#o;

    s#^ ?(NR_SC:GP-)(\S*) ( *)gi\|(\w+)([\w\|]*)($Descrip)($Int) ($Signif) ($Int)$#GenPept\|<a href="$DbUrl{'gb_p'}$4">$4</A>$3 gp|$2 $5$6$7 <a href="\#$4_A">$8</A> $9<a name="$4_H"></A>#o;

    s#^ ?(NR_SC:SW-)$Word ( *)SW:($Word) ($Acc)($Descrip)($Int) ($Signif) ($Int)$#SWISS\|<a href="$DbUrl{'swpr'}$4">$3</A> SW:$3 $4 $5$6 <a href="\#$3_A">$7</A> $8<a name="$3_H"></A>#o;

    s#^ ?(NR_SC:PIR-)$Word ( *)PIR:($Word)($Descrip)($Int) ($Signif) ($Int)$#PIR\|<a href="$DbUrl{'pir_uid'}$3">$3</A> $2 PIR:$3 $4$5 <a href="\#$3_A">$6</A> $7<a name="$3_H"></A>#o;

    s#^ ?(CHRS:)([A-Z][0-9]*)($Descrip)($Int) ($Signif) ($Int)$#$1Segment:$2 $3 $4 <a href="\#$2_A">$5</a> $6<a name="$2_H"></a>#o;

    s#^ ?(CHR[0-9]*)($Descrip)($Int) ($Signif) ($Int)$#$1 $2 $3 <a href="\#$1_A">$4</a> $5<a name="$1_H"></a>#o;

    s#^ ?(NOT:)([A-Z]_[0-9]*-[0-9]*)($Descrip)($Int) ($Signif) ($Int)$#$1$2 $3 $4 <a href="\#$2_A">$5</a> $6<a name="$2_H"></a>#o;

    s#^ ?(UTR5_SC_[0-9]*:)(\S*)($Descrip)($Int) ($Signif) ($Int)$#UTR5:$2 $3 $4 <a href="\#$2_A">$5</a> $6<a name="$2_H"></a>#o;

    # Hits without a db identifier.
    s@^ ?($Word)($Descrip)($Int) ($Signif)(.*)$@$1$2$3 <A href="\#$1_A">$4</a>$5<a name="$1_H"></a>@o;

    $$line_ref = $_;
}
602 | |
603 | |
604 | |
605 | |
606 =head2 _prog_ref_html | |
607 | |
608 Usage : n/a; utility method used by get_html_func(). | |
609 Purpose : Get the HTML-formatted list of BLAST literature references and program links. | |
610 Returns : string with HTML | |
611 | |
612 See Also : L<get_html_func()|get_html_func> | |
613 | |
614 =cut | |
615 | |
616 #------------------ | |
sub _prog_ref_html {
    # Returns the HTML block of BLAST literature references and program
    # links.  The Bioperl home URL is obtained from $BioWWW at call time.
    #
    # Not really a reference for the Blast algorithm itself but an interesting usage.
    #<li>Gish, Warren, and David J. States (1993). Identification of protein coding regions by database similarity search.
    #<a href="http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?uid=8485583&form=6&db=m&Dopt=r">Nature Genetics 3:266-72</a>.
    my $references_html = <<"END_PROG_REFS";
<p>
<small>
<b>References:</b>
<ol>
<li>Altschul, Stephen F., Warren Gish, Webb Miller, Eugene W. Myers, and David J. Lipman (1990).
Basic local alignment search tool.
<a href="http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?uid=2231712&form=6&db=m&Dopt=r">J. Mol. Biol. 215: 403-10</a>.
<li>Altschul et al. (1997), Gapped BLAST and PSI-BLAST:
a new generation of protein database search programs.
<a href="http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?uid=9254694&form=6&db=m&Dopt=r">Nucl. Acids Res. 25: 3389-3402</a>.
<li><b>Program Descriptions</b>:
<a href="http://www.ncbi.nlm.nih.gov/BLAST/newblast.html">BLAST2</a> |
<a href="http://blast.wustl.edu/">WU-BLAST2</a> |
<a href="http://www.ncbi.nlm.nih.gov/BLAST/blast_help.html">Help Manual</a>
</ol>
<small>
HTML formatting provided by the <a href="${\$BioWWW->home_url('bioperl')}Projects/Blast/">Bioperl Blast module</a>.
</small>
</small>
<p>

END_PROG_REFS

    return $references_html;
}
648 | |
649 | |
650 =head2 _genbank_alert | |
651 | |
652 Usage : n/a; utility method used by get_html_func(). | |
653 Purpose : Get a special alert for BLAST reports against all of GenBank/EMBL. | |
654 Returns : string with HTML | |
655 | |
656 See Also : L<get_html_func()|get_html_func> | |
657 | |
658 =cut | |
659 | |
660 #------------------ | |
sub _genbank_alert {
    # Returns the fixed HTML caution paragraph that get_html_func() inserts
    # ahead of the summary table.  The text contains nothing to interpolate,
    # so a non-interpolating here-document is used.
    my $alert_html = <<'END_GENBANK_ALERT';
<p><b><font color="red">CAUTION: Hits reported on this page may be derived from DNA sequences
that contain more than one gene.
</font>To avoid mis-interpretation, always check database entries
for any sequence of interest to verify that the similarity
occurs within the described sequence. (E.g., A DNA sequence
for gene X as reported in GenBank may contain a 5' or 3'
fragment of coding sequence for a neighboring gene Y, yet will
be listed as gene X, since gene Y had not yet been identified). </b>
END_GENBANK_ALERT

    return $alert_html;
}
674 | |
675 | |
676 | |
677 =head2 strip_html | |
678 | |
679 Usage : $boolean = &strip_html( string_ref ); | |
680 : This method is exported. | |
681 Purpose : Removes HTML formatting from a supplied string. | |
682 : Attempts to restore the Blast report to enable | |
683 : parsing by Bio::Tools::Blast.pm. | |
684 Returns : Boolean: true if string was stripped, false if not. | |
685 Argument : string_ref = reference to a string containing the whole Blast | |
686 : report. | |
687 Throws : Croaks if the argument is not a scalar reference. | |
688 Comments : Based on code originally written by Alex Dong Li | |
689 : (ali@genet.sickkids.on.ca). | |
690 : This method does some Blast-specific stripping | |
691 : (adds back a '>' character in front of each HSP | |
692 : alignment listing). | |
693 : | |
694 : THIS METHOD IS HIGHLY ERROR-PRONE! | |
695 : | |
696 : Removal of the HTML tags and accurate reconstitution of the | |
697 : non-HTML-formatted report is highly dependent on structure of | |
698 : the HTML-formatted version. For example, it assumes that first | |
699 : line of each alignment section (HSP listing) starts with a | |
700 : <a name=..> anchor tag. This permits the reconstruction of the | |
701 : original report in which these lines begin with a ">". | |
702 : This is required for parsing. | |
703 : | |
704 : If the structure of the Blast report itself is not intended to | |
705 : be a standard, the structure of the HTML-formatted version | |
706 : is even less so. Therefore, the use of this method to | |
707 : reconstitute parsable Blast reports from HTML-format versions | |
708 : should be considered a temporary solution. | |
709 | |
710 See Also : B<Bio::Tools::Blast::parse()> | |
711 | |
712 =cut | |
713 | |
714 #--------------- | |
sub strip_html {
    # Purpose  : Removes HTML formatting from a Blast report, in place, and
    #            puts a '>' back in front of each alignment (HSP) header line.
    # Argument : $string_ref - SCALAR ref to the text to be stripped.
    # Returns  : 1 if anything was changed, 0 otherwise.
    # Throws   : Croaks if the argument is not a SCALAR reference.
    #
    # This may not be the best way to remove html tags. However, it is simple.
    # It won't work under the following conditions:
    #   1) if a quoted > appears in a tag (does this ever happen?)
    #   2) if a tag is split over multiple lines and this method is
    #      used to process one line at a time.
    my $string_ref = shift;

    # croak() was never imported by this module, so load Carp here and call
    # it by its full name.
    require Carp;

    ref($string_ref) eq 'SCALAR'
        or Carp::croak( "Can't strip HTML: "
            . "Argument should be a SCALAR reference not a ${\ref $string_ref}" );

    my $str      = $$string_ref;
    my $stripped = 0;

    # Removing "<a name =...>" and adding the '>' character for
    # HSP alignment listings.
    # NOTE(review): the newline matched in front of the anchor is consumed as
    # well, so the line is joined to the previous one unless a blank line
    # precedes it -- confirm this is what Blast.pm expects before changing it.
    $str =~ s/(\A|\n)<a name ?=[^>]+> ?/>/gi and $stripped = 1;

    # Removing all "<>" tags and non-breaking-space padding, both the literal
    # character and the "&nbsp" entity (with or without the semicolon).
    $str =~ s/<[^>]+>| |&nbsp;?//gi and $stripped = 1;

    # Re-uniting any lone '>' characters.
    $str =~ s/(\A|\n)>\s+/\n\n>/g and $stripped = 1;

    $$string_ref = $str;
    return $stripped;
}
745 | |
746 1; | |
747 __END__ | |
748 | |
749 ##################################################################################### | |
750 # END OF CLASS # | |
751 ##################################################################################### | |
752 | |
753 | |
754 |