0
|
1 #-------------------------------------------------------------------------------
|
|
2 # PACKAGE : Bio::Tools::Blast::HTML
|
|
3 # PURPOSE : To encapsulate code for HTML formatting BLAST reports.
|
|
4 # AUTHOR : Steve Chervitz (sac@bioperl.org)
|
|
5 # CREATED : 28 Apr 1998
|
|
6 # STATUS : Alpha
|
|
7 # REVISION: $Id: HTML.pm,v 1.15 2002/11/04 09:12:51 heikki Exp $
|
|
8 #
|
|
9 # For the latest version and documentation, visit the distribution site:
|
|
10 # http://bio.perl.org/Projects/Blast/
|
|
11 #
|
|
12 # To generate documentation, run this module through pod2html
|
|
13 # (preferably from Perl v5.004 or better).
|
|
14 #
|
|
15 # CUSTOMIZATION NOTE:
|
|
16 #
|
|
17 # If your Blast reports are not getting marked up correctly, add or
|
|
18 # modify the regexps in _markup_report() to accommodate the format of
|
|
19 # your reports.
|
|
20 #
|
|
21 # Copyright (c) 1996-98 Steve Chervitz. All Rights Reserved.
|
|
22 # This module is free software; you can redistribute it and/or
|
|
23 # modify it under the same terms as Perl itself.
|
|
24 #-------------------------------------------------------------------------------
|
|
25
|
|
26 package Bio::Tools::Blast::HTML;
|
|
use strict;
use Carp qw(croak);
use Exporter;

use Bio::Tools::WWW qw(:obj);
|
|
31
|
|
# Package globals.
#   @ISA / @EXPORT* / %EXPORT_TAGS : Exporter configuration.
#   $ID, $Revision                 : module identification strings.
#   %DbUrl, %SGDUrl                : URL tables, filled by _set_markup_data().
#   $Acc ... $Descrip              : regexp fragments, filled by _set_markup_data().
use vars qw(
    @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS
    $ID $Revision
    %DbUrl %SGDUrl
    $Acc $Pir_acc $Word $Signif $Int $Descrip
);

# Nothing is exported by default; both public functions are available on
# request, individually or together through the ':std' tag.
@ISA         = qw(Exporter);
@EXPORT      = ();
@EXPORT_OK   = qw(&get_html_func &strip_html);
%EXPORT_TAGS = ( std => [ @EXPORT_OK ] );

$ID       = 'Bio::Tools::Blast::HTML';
$Revision = '$Id: HTML.pm,v 1.15 2002/11/04 09:12:51 heikki Exp $';  #'

# File-scoped state shared by the functions below.
my $_set_markup = 0;     # True once _set_markup_data() has run.
my $_gi_link    = '';    # Base URL for GI/accession links; set by the
                         # closure from get_html_func(), read by _markup_report().
|
|
46
|
|
47
|
|
48 ## POD Documentation:
|
|
49
|
|
50 =head1 NAME
|
|
51
|
|
52 Bio::Tools::Blast::HTML - Bioperl Utility module for HTML formatting Blast reports
|
|
53
|
|
54 =head1 SYNOPSIS
|
|
55
|
|
56 =head2 Adding HTML-formatting
|
|
57
|
|
58 use Bio::Tools::Blast::HTML qw(&get_html_func);
|
|
59
|
|
60 $func = &get_html_func();
|
|
61
|
|
62 # Now as each line of the report is read, pass it to &$func($line).
|
|
63
|
|
64 See L<get_html_func()|get_html_func> for details.
|
|
65 Also see B<Bio::Tools::Blast::to_html> for an example of usage.
|
|
66
|
|
67
|
|
68 =head2 Removing HTML-formatting
|
|
69
|
|
70 use Bio::Tools::Blast::HTML qw(&strip_html);
|
|
71
|
|
72 &strip_html(\$blast_report_string)
|
|
73
|
|
74 See L<strip_html()|strip_html> for details.
|
|
75
|
|
76
|
|
77 =head1 INSTALLATION
|
|
78
|
|
79 This module is included with the central Bioperl distribution:
|
|
80
|
|
81 http://bio.perl.org/Core/Latest
|
|
82 ftp://bio.perl.org/pub/DIST
|
|
83
|
|
84 Follow the installation instructions included in the README file.
|
|
85
|
|
86 =head1 DESCRIPTION
|
|
87
|
|
88 This module can be used to add HTML formatting to or remove HTML
|
|
89 formatting from a raw Blast sequence analysis report. Hypertext links
|
|
90 to the appropriate database are added for each hit sequence (GenBank,
|
|
91 Swiss-Prot, PIR, PDB, SGD).
|
|
92
|
|
93 This module is intended for use by Bio::Tools::Blast.pm and related modules,
|
|
94 which provides a front-end to the methods in Bio::Tools::Blast::HTML.pm.
|
|
95
|
|
96 =head1 DEPENDENCIES
|
|
97
|
|
98 Bio::Tools::Blast::HTML.pm does not inherit from any other class
|
|
99 besides Exporter. It is used by B<Bio::Tools::Blast.pm> only. This
|
|
100 class relies on B<Bio::Tools::WWW.pm> to provide key URLS for adding
|
|
101 links in the Blast report to specific databases.
|
|
102
|
|
103 The greatest dependency comes from the dynamic state of the web. URLs
|
|
104 are likely to change in the future, so all links cannot be
|
|
105 guaranteed to work indefinitely. Feel free to report broken or
|
|
106 incorrect database links (L<FEEDBACK | FEEDBACK>). Thanks!
|
|
107
|
|
108 =head1 SEE ALSO
|
|
109
|
|
110 Bio::Tools::Blast.pm - Blast object.
|
|
111 Bio::Tools::WWW.pm - URL repository.
|
|
112
|
|
113 http://bio.perl.org/Projects/modules.html - Online module documentation
|
|
114 http://bio.perl.org/Projects/Blast/ - Bioperl Blast Project
|
|
115 http://bio.perl.org/ - Bioperl Project Homepage
|
|
116
|
|
117 =head1 FEEDBACK
|
|
118
|
|
119 =head2 Mailing Lists
|
|
120
|
|
121 User feedback is an integral part of the evolution of this and other
|
|
122 Bioperl modules. Send your comments and suggestions preferably to one
|
|
123 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
124
|
|
125 bioperl-l@bioperl.org - General discussion
|
|
126 http://bio.perl.org/MailList.html - About the mailing lists
|
|
127
|
|
128 =head2 Reporting Bugs
|
|
129
|
|
130 Report bugs to the Bioperl bug tracking system to help us keep
|
|
131 track of the bugs and their resolution. Bug reports can be submitted
|
|
132 via email or the web:
|
|
133
|
|
134 bioperl-bugs@bio.perl.org
|
|
135 http://bugzilla.bioperl.org/
|
|
136
|
|
137 =head1 AUTHOR
|
|
138
|
|
139 Steve Chervitz, E<lt>sac@bioperl.orgE<gt>
|
|
140
|
|
141 =head1 COPYRIGHT
|
|
142
|
|
143 Copyright (c) 1998-2000 Steve Chervitz. All Rights Reserved.
|
|
144 This module is free software; you can redistribute it and/or
|
|
145 modify it under the same terms as Perl itself.
|
|
146
|
|
147
|
|
148 =cut
|
|
149
|
|
150
|
|
151 #
|
|
152 ##
|
|
153 ###
|
|
154 #### END of main POD documentation.
|
|
155 ###
|
|
156 ##
|
|
157 #'
|
|
158
|
|
159
|
|
160 ###################### BEGIN FUNCTIONS ########################
|
|
161
|
|
162 =head1 APPENDIX
|
|
163
|
|
164 Methods beginning with a leading underscore are considered private
|
|
165 and are intended for internal use by this module. They are
|
|
166 B<not> considered part of the public interface and are described here
|
|
167 for documentation purposes only.
|
|
168
|
|
169
|
|
170
|
|
171 =head2 get_html_func
|
|
172
|
|
173 Usage : $func_ref = &get_html_func( [array_ref] );
|
|
174 : This method is exported.
|
|
175 Purpose : Provides a function that adds HTML formatting to a
|
|
176 : raw Blast report line-by-line.
|
|
177 : Utility method used by to_html() in Bio::Tools::Blast.pm.
|
|
178 Returns : Reference to an anonymous function to be used while reading in
|
|
179 : the raw report.
|
|
180 : The function itself operates on the Blast report line-by-line
|
|
181 : HTML-ifying it and printing it to STDOUT (or saving in the supplied
|
|
182 : array ref) as it goes:
|
|
183 : foreach( @raw_report ) { &$func_ref($_); }
|
|
184 Argument : array ref (optional) for storing the HTML-formatted report.
|
|
185 : If no argument is supplied, HTML output is sent to STDOUT.
|
|
186 Throws : Croaks if an argument is supplied and is not an array ref.
|
|
187 : The anonymous function returned by this method croaks if
|
|
188 : the Blast output appears to be HTML-formatted already.
|
|
189 Comments : Adapted from a script by Keith Robison November 1993
|
|
190 : krobison@nucleus.harvard.edu; http://golgi.harvard.edu/gilbert.html
|
|
191 : Modified extensively by Steve Chervitz and Mike Cherry.
|
|
192 : Some modifications are customizations for BLAST reports served up
|
|
193 : by the Saccharomyces Genome Database.
|
|
194 : Feel free to modify or replace portions of this code as necessary
|
|
195 : to accommodate new BLAST datasets or changes to the Blast format.
|
|
196
|
|
197 See Also : B<Bio::Tools::Blast::to_html()>
|
|
198
|
|
199 =cut
|
|
200
|
|
201 #--------------------
|
|
sub get_html_func {
#--------------------
    # Purpose  : Build a closure that HTML-formats a raw Blast report one
    #            line at a time.
    # Argument : $out_aref (optional) - array ref that receives the HTML.
    #            Without it, the HTML is printed to the currently selected
    #            output handle.
    # Returns  : Code ref. Call it once per report line, in order; it
    #            returns 1.
    # Throws   : Croaks if $out_aref is defined but is not an ARRAY ref.
    #            The returned closure croaks if a line contains <HTML>,
    #            <TITLE> or <PRE> (report already HTML-formatted).
    # Comments : The closure keeps its parsing state in the lexicals below.
    #            The GI link base URL is stored in the file-scoped
    #            $_gi_link because _markup_report() reads it from there.
    my ($out_aref) = @_;

    ## Key booleans used in parsing.
    my $found_table     = 0;  # Located the summary table ('descriptions').
    my $found_data      = 0;  # Nothing is passed through until this is true.
    my $skip            = 0;  # Suppress pass-through of report header lines,
    my $ref_skip        = 0;  # ... and of the program's reference section,
                              # which is replaced by our own HTML version.
    my $getNote         = 0;  # Inside the text following a WARNING/NOTICE.
    my $getGenBankAlert = 0;  # Emit the GenBank caution before the table.
    my $prog            = ''; # Name of the BLAST program (e.g., TBLASTN).

    if ( defined($out_aref) and ref($out_aref) ne 'ARRAY' ) {
        croak("Argument must be an ARRAY ref not a ${\ref $out_aref}.");
    }

    # Single place that decides where the generated HTML goes.
    my $emit = $out_aref
        ? sub { push @$out_aref, @_ }
        : sub { print @_ };

    my $refs = _prog_ref_html();

    _set_markup_data() if not $_set_markup;

    return sub {
        # A single line from a Blast report.
        my $line = shift;

        croak("Report appears to be HTML formatted already.")
            if $line =~ m/<HTML>|<TITLE>|<PRE>/i;

        if ( not $found_table ) {
            # A blank line ends the reference section being skipped.
            $ref_skip = 0 if $ref_skip and $line =~ /^\s+$/;

            if ($getNote) {
                # Text belonging to a WARNING/NOTICE; a blank line ends it.
                $emit->($line);
                $getNote = 0 if $line =~ m/^\s+$/;
            } elsif (    $line =~ m/(.*), Up \d.*/
                      or $line =~ /Date: +(.+)/
                      or $line =~ /Start: +(.+?) +End:/ ) {
                ### Network BLAST reports from NCBI are time stamped as follows:
                #Fri Apr 18 15:55:41 EDT 1997, Up 1 day, 19 mins, 1 user, load: 19.54, 19.13, 17.77
                $emit->("<b>BLASTed on:</b> $1<p>\n");
            } elsif ( $line =~ /^(?:<\w+>)?(T?BLAST[NPX])\s+/ ) {
                # Save the program name before the substitution below
                # overwrites the capture variables; reading it afterwards
                # would yield the version text instead of the name.
                $prog       = $1;
                $found_data = 1;
                $line =~ s#(\S+)\s+(.*)#<P><B>Program:</B> $1 $2 <br>#;
                $emit->($line);
                $skip = 1;

                ## Prevent the error at Entrez when you ask for a nucl
                ## entry with a protein GI number.
                $_gi_link = $prog =~ /BLASTN/
                    ? $DbUrl{'gb_n'}     # nucleotide
                    : $DbUrl{'gb_p'};    # protein
            } elsif ( $line =~ m/^Query=/ ) {
                # Keeping the "Query=" format to keep it parsable by Blast.pm
                # (after stripping HTML).
                $line =~ s#Query= *(.*)#<title>$1</title>\n<p><b>Query=</b> $1#;
                $emit->($line);
                $skip = 1;
            } elsif ( $line =~ /Reference:/ ) {
                $ref_skip = 1;
            } elsif ( $line =~ /^Database:/ ) {
                _markup_database(\$line);
                $emit->($line);
                if ( $line =~ /non-redundant genbank/i and $prog =~ /TBLAST[NX]/i ) {
                    $getGenBankAlert = 1;
                }
                $skip = 1;
            } elsif ( $line =~ /sequences;/ ) {
                $emit->("$line<p>");
            } elsif ( $line =~ /^\s+\(\d+ letters\)\s+/ ) {
                $emit->("<br>    $line");
            } elsif ( $line =~ /^(WARNING|NOTICE):/i ) {
                # Red label (WARNING or NOTICE) followed by the message text.
                $line =~ s#^(WARNING|NOTICE): *(.*)#<p><b><font color="red">$1:</font></b> $2#i;
                $emit->($line);
                $getNote = 1;
            } elsif ( $line =~ /Score +E\s*$/ or $line =~ /Probability\s*$/ ) {
                # Put the last HTML-formatted lines before the main body of report.
                $found_table = 1;
                $skip        = 0;
                $emit->($refs);
                $emit->( scalar _genbank_alert() ) if $getGenBankAlert;
                $emit->("\n<p><pre>");
            }
        } else {
            _markup_report(\$line);
        }

        # Pass the (possibly marked-up) line through unless it belongs to a
        # header section that was already handled or is being skipped.
        if ( $found_data and not( $skip or $ref_skip ) ) {
            $emit->($line);
        }
        return 1;
    };  # end sub {}
}
|
|
308
|
|
309
|
|
310
|
|
311
|
|
312 =head2 _set_markup_data
|
|
313
|
|
314 Usage : n/a; utility method used by get_html_func()
|
|
315 Purpose : Sets various hashes and regexps used for adding HTML
|
|
316 : to raw Blast output.
|
|
317 Returns : n/a
|
|
318 Comments : These items need be set only once.
|
|
319
|
|
320 See Also : L<get_html_func()|get_html_func>
|
|
321
|
|
322 =cut
|
|
323
|
|
324 #-------------------
|
|
sub _set_markup_data {
#-------------------
    # One-time initialization of the package-level lookup tables and
    # regexp fragments used by the markup functions.

    # URL tables supplied by the $BioWWW object.
    %DbUrl  = $BioWWW->search_url('all');
    %SGDUrl = $BioWWW->sgd_url('all');

    # Regexp fragments (plain strings interpolated into the patterns).
    ( $Signif, $Int, $Descrip, $Acc, $Pir_acc, $Word ) = (
        '[\de.-]{3,}',          # P-value or Expect value
        ' *\d\d*',              # integer
        ' +.* {2,}?',           # description line
        '[A-Z][\d.]+',          # GB/EMBL/DDJB/SP accession number
        '[A-Z][A-Z0-9]{5,}',    # PIR accession number
        '[\w_.]+',              # word; includes dot for version
    );

    # Record that the data are in place so this runs only once.
    $_set_markup = 1;
}
|
|
339
|
|
340
|
|
341 =head2 _markup_database
|
|
342
|
|
343 Usage : n/a; utility method used by get_html_func()
|
|
344 Purpose : Converts a cryptic database ID into a readable name.
|
|
345 Returns : n/a
|
|
346 Comments : This is used for converting local database IDs into
|
|
347 : understandable terms. At present, it only recognizes
|
|
348 : databases used locally at SGD.
|
|
349
|
|
350 See Also : L<get_html_func()|get_html_func>
|
|
351
|
|
352 =cut
|
|
353
|
|
354 #---------------------
|
|
sub _markup_database {
#---------------------
    # Rewrites a "Database:" report line in place (through the supplied
    # scalar reference): known SGD dataset IDs are replaced by readable
    # names and the "Database:" label is set in bold.
    my $line_ref = shift;
    my $text     = $$line_ref;

    # Dataset ID => readable name. Applied in this order, first
    # occurrence only.
    my @readable_name_for = (
        [ 'YeastN'     => '<i>S. cerevisiae</i> GenBank Data Set; ' ],
        [ 'YeastP'     => 'Non-Redundant <i>S. cerevisiae</i> Protein Data Set; ' ],
        [ 'genoSC'     => 'Complete DNA Sequence for the S. cerevisiae Genome; ' ],
        [ 'YeastORF-P' => 'Translation of all Standard S.c. ORFs; ' ],
        [ 'YeastORF-N' => 'Coding Sequence of all Standard S.c. ORFs; ' ],
    );

    foreach my $entry (@readable_name_for) {
        my ( $db_id, $readable ) = @$entry;
        $text =~ s#\Q$db_id\E#$readable#;
    }

    $text =~ s#Database: *(.*)#<p><b>Database:</b> $1#;

    $$line_ref = $text;
}
|
|
369
|
|
370
|
|
371 =head2 _markup_report
|
|
372
|
|
373 Usage : n/a; utility function used by get_html_func()
|
|
374 Purpose : Adds HTML links to aid navigation of raw Blast output.
|
|
375 Returns : n/a
|
|
376 Comments : HTML-formatting is dependent on the Blast server that
|
|
377 : provided the Blast report. Currently, this function can handle reports
|
|
378 : produced by NCBI and SGD. Feel free to modify this function
|
|
379 : to accommodate reports produced by other servers/sites.
|
|
380 :
|
|
381 : This function is simply a collection of substitution regexps
|
|
382 : that recognize and modify the relevant lines of the Blast report.
|
|
383 : All non-header lines of the report are passed through this function,
|
|
384 : only the ones that match will get modified.
|
|
385 :
|
|
386 : The general scheme for adding links is as follows:
|
|
387 : (Some of the SGD markups do not follow this scheme precisely
|
|
388 : but this is the general trend.)
|
|
389 :
|
|
390 : For description lines in the summary table at the top of report:
|
|
391 :
|
|
392 : DB:SEQUENCE_ID DESCRIPTION SIGNIF_VAL
|
|
393 : DB = links to the indicated database (if not Gen/Embl/Ddbj).
|
|
394 : SEQUENCE_ID = links to GenBank entry for the sequence.
|
|
395 : SIGNIF_VAL = internal link to relevant alignment section.
|
|
396 :
|
|
397 : For the alignment sections in the body of the report:
|
|
398 :
|
|
399 : DB:SEQUENCE_ID (Back | Top) DESCRIPTION
|
|
400 : DB = links to the indicated database (if not Gen/Embl/Ddbj).
|
|
401 : SEQUENCE_ID = links to GenBank entry for the sequence.
|
|
402 : SIGNIF_VAL = internal link to alignment section.
|
|
403 : Back = internal link to description line in summary section.
|
|
404 : Top = internal link to top of page.
|
|
405 :
|
|
406 : 'DB' links are created for PDB, PIR, and SwissProt sequences.
|
|
407 :
|
|
408 : RE_PARSING HTML-FORMATTED REPORTS:
|
|
409 : ----------------------------------
|
|
410 : HTML-formatted reports generated by this module, as well as reports
|
|
411 : obtained from the NCBI servers, should be parsable
|
|
412 : by Bio::Tools::Blast.pm. Parsing HTML-formatted reports is
|
|
413 : slow, however, since the HTML must be removed prior to parsing.
|
|
414 : Parsing HTML-formatted reports is dependent on the specific structure
|
|
415 : of the HTML and is generally not recommended.
|
|
416 :
|
|
417 : Note that since URLs can change without notice, links will need updating.
|
|
418 : The links are obtained from Bio::Tools::WWW.pm updating that module
|
|
419 : will update this as well.
|
|
420 :
|
|
421 Bugs : Some links to external databases are incorrect
|
|
422 : (in particular, for 'bbs' and 'prf' databases on NCBI Blast reports).
|
|
423 : Some links may fail as a result of the dynamic nature of the web.
|
|
424 : Hypertext links are not added to hits without database ids.
|
|
425
|
|
426 See Also : L<get_html_func()|get_html_func>, B<Bio::Tools::WWW.pm>, L<strip_html>()
|
|
427
|
|
428 =cut
|
|
429
|
|
430 #--------------------
|
|
sub _markup_report {
#--------------------
    # Purpose  : Adds hypertext links to one line of the report body.
    # Argument : Reference to the scalar holding the line; the line is
    #            rewritten in place.
    # Comments : The function is a fixed sequence of substitutions; only the
    #            ones whose pattern matches change the line. Patterns are
    #            built from the package regexp fragments ($Word, $Acc,
    #            $Descrip, $Int, $Signif) and link targets come from %DbUrl,
    #            %SGDUrl and the file-scoped $_gi_link.
    #            In alignment headers each "Back" anchor is explicitly closed
    #            before the "Top" anchor so the two links are never nested.
    my $line_ref = shift;
    local $_ = $$line_ref;
    ##
    ## REGEXPS FOR ALIGNMENT SECTIONS (within the body of the report,
    ## the text above the list of HSPs).
    ##
    ## If the HSP alignment sections don't start with a '>' we have no way
    ## of finding them. This occurs with reports saved from HTML-formatted
    ## web pages, which we shouldn't be processing here anyway.

    ## To facilitate parsing of HTML-formatted reports by Bio::Tools::Blast.pm,
    ## the <a name=...> anchors should be added at the BEGINNING of the HSP
    ## alignment section lines and at the END of the description section lines.

    # Removing " ! " added by GCG.
    s/ ! / /;

    ### NCBI-specific markups for HSP alignment section lines:

    # Optional capture groups that did not participate are interpolated as
    # empty strings below; silence the resulting "uninitialized" warnings.
    local($^W) = 0;

    # GenBank/EMBL, DDBJ hits (GenBank Format):
    s@^>(gb|emb|dbj|ref)\|($Word)(\|$Word)?(.*)$@<a name=$2_A></a><b>$1:<a href="$_gi_link$2">$2$3</a></b>$4<br>(<a href="\#$2_H">Back</a>|<a href="\#top">Top</a>)@o;

    s@^>(gb|emb|dbj|ref)\|($Word)(\| \(?$Word\)?)(.*)$@<a name=$2_A></a><b>$1:<a href="$_gi_link$2">$2</a></b>$3$4<br>(<a href="\#$2_H">Back</a>|<a href="\#top">Top</a>)@o;

    # PIR hits
    s@^>pir\|\|($Word)( .*)$@<a name=$1_A></a><b><a href=\"$DbUrl{'pir_acc'}$1\">pir</a>:<a href="$DbUrl{'gb_p'}$1">$1</a></b> $2 <br>(<a href="\#$1_H">Back</a>|<a href="\#top">Top</a>)@o;

    # GI hits (GenBank Format): using a nested (())
    s@^>(gi)\|($Word)( +\(($Word)\))( .*)$@<a name=$4_A></a><b>$1:<a href="$_gi_link$4">$2</a></b>$3$5<br>(<a href="\#$4_H">Back</a>|<a href="\#top">Top</a>)@o;

    # GNL PID hits (GenBank Format):
    s@^>(gnl)\|($Word)?(\|$Word) +\(($Word)\)( .*)$@<a name=$4_A></a><b>$1:<a href="$_gi_link$4">$2$3</a></b>($4)$5<br>(<a href="\#$4_H">Back</a>|<a href="\#top">Top</a>)@o;

    # BBS and PRF hits (what db?) (GenBank Format):
    s@^>(bbs|prf)\|\|?($Word)( .*)$@<a name=$2_A></a><b>$1:<a href="$_gi_link$2">$2</a></b>$3<br>(<a href="\#$2_H">Back</a>|<a href="\#top">Top</a>)@o;

    # SwissProt hits:
    s@^>sp\|($Word)\|($Word)?( .*)$@<a name=$1_A></a><b><a href="$DbUrl{'swpr'}$1">sp</a>:<a href="$DbUrl{'gb_p'}$1">$1|$2</a></b>$3<br>(<a href="\#$1_H">Back</a>|<a href="\#top">Top</a>)@o;


    ## PDB ids with or without a chain identifier (GenBank format)
    s@^>pdb\|(\d\w{3})\|[\w ] (.*)$@<a name=$1_A></A><b><a href=\"$DbUrl{'3db'}$1\">pdb</A>:<a href="$DbUrl{'gb_struct'}$1">$1</a></b> (<a href="\#$1_H">Back</a>|<a href="\#top">Top</a>) $2@o;


    ### SGD-specific markups for HSP alignment section lines:

    ## PDB ids without chain identifier
    s@^>PDB_UNIQUEP:(\d\w{3})_ (.*)$@<a name=$1_A></A><b><A HREF="$DbUrl{'3db'}$1">PDB</a>:<A HREF="$DbUrl{'gb_struct'}$1">$1</A></b> (<a href="\#$1_H">Back</a>|<a href="\#top">Top</a>) $2@o;

    ## PDB ids with chain identifier
    s@^>PDB_UNIQUEP:(\d\w{3})_([\w ]{1})(.*)$@<a name=$1_A></A><b><A HREF="$DbUrl{'3db'}$1">PDB</a>:<A HREF="$DbUrl{'gb_struct'}$1">$1</A></b> Chain:$2, (<a href="\#$1_H">Back</a>|<a href="\#top">Top</a>) $3@o;

    s@^>($Word)PEPT:GI_(\d+)(.*)$@<a name=$2_A></a><b>$1:<a href="$DbUrl{'gb_p'}$2">GI_$2</a></b> $3 <br>(<a href="\#$2_H">Back</a>|<a href="\#top">Top</a>)@o;

    # The gcg blast dataset generating tools up-case all sbjct sequence IDs.
    # This is fine for yeast but not worm. This is considered a hack here.
    s@WORMPEPT:(\w+\.)(\S+)@WORMPEPT:$1\L$2\E@;

    s@^>WORMPEPT:(\S+)(.*)$@<a name=$1_A></a><b>WORMPEP:<A HREF="$DbUrl{'wormace'}$1">$1</a></b> $2 <br>(<a href="\#$1_H">Back</a>|<a href="\#top">Top</a>)@o;

    s#^>(GB_$Word):($Word) ($Acc) (.*$)#<a name=$2_$3_A></A><a href=\#$2_$3_H>$2|$3</A>$4\t<b>[<A HREF=$_gi_link$3>GenBank</A> / <A HREF=$DbUrl{'embl'}$3>EMBL</A> / <A HREF=\"$SGDUrl{'seq_an'}$2\*\">SGD</A>]</b> #o;

    # Sac's version: ORF name is an external link into SGD:
    s@^>ORFP:(\S*) +([\w-]+)(.*$)@<a name=$1_A></A>ORFP:<a href=\"$SGDUrl{'locus'}$2\">$1 $2</A>$3<br>     <b>[<A HREF=\"$SGDUrl{'seq_an'}$2\">Gene/Sequence Resources</a> / <a href=\"$SGDUrl{'map_orf'}$2\">ORF Map</a></b>] <a href="\#$1_H">Back</a>|<a href="\#top">Top</a>@o;

    # Mike's version:
    # s#^>ORFP:(\S*) (.*$)#<a name=$1_A></A><a href=\#$1_H>ORFP:$1</A> $2\t<b>[<A HREF=\"$SGDUrl{'seq_an'}$1\">Gene/Sequence Resources</a> / <a href=\"$SGDUrl{'map_orf'}$1\">ORF Map</a>]</b> #o;

    s#^>ORFN:(\S*) (.*$)#<a name=$1_A></A><a href=\#$1_H>ORFN:$1</A> $2\t<b>[<A HREF=\"$SGDUrl{'seq_an'}$1\">Gene/Sequence Resources</a>] / <a href=\"$SGDUrl{'map_orf'}$1\">ORF Map</a></b> #o;

    s#^>NR_SC:GP-\S* gi\|(\w+)([\w\|]*) (.*$)#<a name=$1_A></A><a href=\#$1_H>GenPept|$1</A> gp|$2 $3\t<b>[<A HREF=$DbUrl{'gb_p'}$1>GenPept</A> / <A HREF=\"$SGDUrl{'gi'}$1\*\">SGD</A>]</b> #o;

    s#^>NR_SC:SW-$Word SW:($Word) ($Acc) (.*$)#<a name=$1_A></A><a href=\#$1_H>SWISS|$1 $2</A> $3\t<b>[<a href=$DbUrl{'swpr'}$2>SwissProt</a> / <A HREF=$DbUrl{'gb_p'}$2>Entrez</A>]</b>#o;

    s#^>NR_SC:PIR-$Word PIR:($Word) (.*$)#<a name=$1_A> </A><a href=\#$1_H>PIR|$1</A> $2\t<b>[<a href=$DbUrl{'pir_uid'}$1>PIR</a> / <A HREF=$DbUrl{'gb_p'}$1>Entrez</A>]</b>#o;

    s#^>CHRS:([A-Z][0-9]*) (.*)$#<a name=$1_A></a><a href=\#$1_H>$1</A> $2: [<b><a href=$SGDUrl{'seq_an'}$1>Gene/Sequence Resources</A> / <a href=\"$SGDUrl{'map_chr'}$1\">ORF Map</a></b>]#o;

    s#^>NOT:([A-Z]_[0-9]*-[0-9]*)( *)Chromosome ([0-9]*) from ([0-9]*) to ([0-9]*)$#<a name=$1_A></a><a href=\#$1_H>$1</A> $2Chromosome $3 from $4 to $5 [<b><a href=$SGDUrl{'chr'}$3\&beg=$4\&end=$5>Gene/Sequence Resources</a> / <a href=\"$SGDUrl{'map_chr'}$3\&beg=$4\&end=$5\">ORF Map</a> / <a href=\"$SGDUrl{'chr_old'}$3\&beg=$4\&end=$5\">Retrieve DNA</a></b>]#o;

    s#^>UTR5_SC_[0-9]*:(\S*) 5' untranslated region, chr(\S*) ([0-9]*) - ([0-9]*)(.*$)#<a name=$1_A></A><a href=\#$1_H>UTR5:$1</A> $1 5' untranslated region, chr$2 $3 - $4, $5\t<b>[<A HREF=\"$SGDUrl{'chr'}$2&beg=$3&end=$4\">Gene/Sequence Resources</A> / <a href=\"$SGDUrl{'map_chr'}$2\&beg=$3\&end=$4\">ORF Map</a>]</b>#o;

    # Hits without a db identifier.
    # If any of the previous regexps succeed, the leading '>' will be removed.
    # Otherwise, this regexp could cause trouble.
    s@^>($Word)(.*)$@<a name=$1_A></a>$1 $2<br>(<a href="\#$1_H">Back</a>|<a href="\#top">Top</a>)@o;

    ##
    ## REGEXPS FOR SUMMARY TABLE LINES AT TOP OF REPORT (a.k.a. 'descriptions')
    ## (table of sequence id, description, score, P/Expect value, n)
    ##
    ## Not using bold face to highlight the sequence id's since this can throw
    ## off formatting of the line when the IDs are different lengths. This led
    ## to the scores and P/Expect values not lining up properly.

    ### NCBI-specific markups for description lines:

    # GenBank/EMBL, DDBJ hits (GenBank Format):
    s@^ ?(gb|emb|dbj|ref)\|($Word)(\|$Word)?($Descrip)($Int +)($Signif)(.*)$@$1:<a href="$_gi_link$2">$2$3</a>$4$5<A href="\#$2_A">$6</a>$7<a name="$2_H"></a>@o;

    s@^ ?(gb|emb|dbj|ref)\|($Word)(\| \(?$Word\)?)($Descrip)($Int +)($Signif)(.*)$@$1:<a href="$_gi_link$2">$2</a>$3$4$5<A href="\#$2_A">$6</a>$7<a name="$2_H"></a>@o;

    # Missing inner ID
    s@^ ?pir\|\|($Word)?($Descrip)($Int) ($Signif)(.*)$@<a href="$DbUrl{'pir_acc'}$1">pir</a>:<a href="$DbUrl{'gb_p'}$1">$1</a> $2$3 <A href="\#$1_A">$4</a>$5<a name="$1_H"></a>@o;

    # GI hits (GenBank Format): using a nested (())
    s@^ ?gi\|($Word)( +\(($Word)\))($Descrip)($Int) ($Signif)(.*)$@gi:<a href="$_gi_link$3">$1</a>$2$4$5 <A href="\#$3_A">$6</a>$7<a name="$3_H"></a>@o;

    s@^ ?(gnl)\|($Word)?(\|$Word +)\(($Word)\)($Descrip)($Int) ($Signif)(.*)$@$1:<a href="$_gi_link$4">$2$3</a>($4)$5$6 <A href="\#$4_A">$7</a>$8<a name="$4_H"></a>@o;


    s@^ ?(bbs|prf)\|\|?($Word)($Descrip)($Int) ($Signif)(.*)$@$1:<a href="$_gi_link$2">$2</a> $3$4 <A href="\#$2_A">$5</a>$6<a name="$2_H"></a>@o;


    ## SwissProt accessions (GenBank format)
    s@^ ?sp\|($Word)(\|$Word)?($Descrip)($Int) ($Signif)(.*)$@<a href="$DbUrl{'swpr'}$1">sp</a>:<a href="$DbUrl{'gb_p'}$1">$1$2</a>$3$4 <a href="\#$1_A">$5</a>$6<a name="$1_H"></a>@o;

    ## PDB ids with or without a chain ID (GenBank format)
    s@^ ?pdb\|($Word)\|($Word)?($Descrip)($Int) ($Signif)(.*)$@<a href="$DbUrl{'3db'}$1">pdb</a>:<a href="$DbUrl{'gb_struct'}$1">$1_$2</a>$3$4 <a href="\#$1_A">$5</a>$6<a name="$1_H"></a>@o;


    ### SGD-specific markups for description lines:

    ## PDB ids without chain identifier
    s@^ ?PDB_UNIQUEP:(\d\w{3})_($Descrip)($Int) ($Signif)(.*)$@<a href="$DbUrl{'3db'}$1">PDB</a>:<A HREF="$DbUrl{'gb_struct'}$1">$1</A> $2$3 <a href="\#$1_A">$4</a>$5<a name="$1_H"></a>@o;


    ## PDB ids with chain identifier
    s@^ ?PDB_UNIQUEP:(\d\w{3})_(\w)($Descrip)($Int) ($Signif)(.*)$@<a href="$DbUrl{'3db'}$1">PDB</a>:<A HREF="$DbUrl{'gb_struct'}$1">$1</A> Chain:$2$3$4 <a href="\#$1_A">$5</a>$6<a name="$1_H"></a>@o;


    s@^ ?($Word)PEPT:GI_(\d+)($Descrip)($Int) ($Signif)(.*)$@$1:<A HREF="$DbUrl{'gb_p'}$2">GI_$2</A> $3 $4 <a href="\#$2_A">$5</a> $6<a name="$2_H"></a>@o;

    s@^ *WORMPEPT:(\S+)($Descrip)($Int) ($Signif)(.*)$@WORMPEP:<A HREF="$DbUrl{'wormace'}$1">$1</a> $2 $3 <a href="\#$1_A">$4</a>$5<a name="$1_H"></a>@o;

    ## Mike Cherry's markups. SAC note: added back database name to allow
    ## the HTML-formatted version to be parsable by Blast.pm.

    s#^ ?(GB_$Word:)($Word)( *)($Acc)($Descrip)($Int) ( *$Signif) ( *\d*)$#GenBank\|<a href="$_gi_link$4">$2</A>\|$4 $3$5$6 <a href="\#$2_$4_A">$7</A> $8<a name="$2_$4_H"></A>#o;

    # Mike's version:
    # s#^ ?(ORFP:)(\S*)($Descrip)($Int) ($Signif) ($Int)$#$1<b>$2</b> $3 $4 <a href="\#$2_A">$5</a> $6<a name="$2_H"></a>#o;

    # My modification:
    s@^ ?ORFP:(\S*) +([\w-]+)(.*[ ]{2,3})($Int) ($Signif) ($Int)$@ORFP:<A HREF=\"$SGDUrl{'locus'}$2\">$1 $2</A>$3$4 <a href="\#$1_A">$5</a> $6<a name="$1_H"></a>@o;

    s#^ ?(ORFN:)(\S*)($Descrip)($Int) ($Signif) ($Int)$#$1$2 $3 $4 <a href="\#$2_A">$5</a> $6<a name="$2_H"></a>#o;

    s#^ ?(NR_SC:GP-)(\S*) ( *)gi\|(\w+)([\w\|]*)($Descrip)($Int) ($Signif) ($Int)$#GenPept\|<a href="$DbUrl{'gb_p'}$4">$4</A>$3 gp|$2 $5$6$7 <a href="\#$4_A">$8</A> $9<a name="$4_H"></A>#o;

    s#^ ?(NR_SC:SW-)$Word ( *)SW:($Word) ($Acc)($Descrip)($Int) ($Signif) ($Int)$#SWISS\|<a href="$DbUrl{'swpr'}$4">$3</A> SW:$3 $4 $5$6 <a href="\#$3_A">$7</A> $8<a name="$3_H"></A>#o;

    s#^ ?(NR_SC:PIR-)$Word ( *)PIR:($Word)($Descrip)($Int) ($Signif) ($Int)$#PIR\|<a href="$DbUrl{'pir_uid'}$3">$3</A> $2 PIR:$3 $4$5 <a href="\#$3_A">$6</A> $7<a name="$3_H"></A>#o;

    s#^ ?(CHRS:)([A-Z][0-9]*)($Descrip)($Int) ($Signif) ($Int)$#$1Segment:$2 $3 $4 <a href="\#$2_A">$5</a> $6<a name="$2_H"></a>#o;

    s#^ ?(CHR[0-9]*)($Descrip)($Int) ($Signif) ($Int)$#$1 $2 $3 <a href="\#$1_A">$4</a> $5<a name="$1_H"></a>#o;

    s#^ ?(NOT:)([A-Z]_[0-9]*-[0-9]*)($Descrip)($Int) ($Signif) ($Int)$#$1$2 $3 $4 <a href="\#$2_A">$5</a> $6<a name="$2_H"></a>#o;

    s#^ ?(UTR5_SC_[0-9]*:)(\S*)($Descrip)($Int) ($Signif) ($Int)$#UTR5:$2 $3 $4 <a href="\#$2_A">$5</a> $6<a name="$2_H"></a>#o;

    # Hits without a db identifier.
    s@^ ?($Word)($Descrip)($Int) ($Signif)(.*)$@$1$2$3 <A href="\#$1_A">$4</a>$5<a name="$1_H"></a>@o;

    # Hand the marked-up line back to the caller.
    $$line_ref = $_;
}
|
|
602
|
|
603
|
|
604
|
|
605
|
|
606 =head2 _prog_ref_html
|
|
607
|
|
608 Usage : n/a; utility method used by get_html_func().
|
|
609 Purpose : Get the HTML-formatted list of BLAST literature references and program description links.
|
|
610 Returns : string with HTML
|
|
611
|
|
612 See Also : L<get_html_func()|get_html_func>
|
|
613
|
|
614 =cut
|
|
615
|
|
616 #------------------
|
|
sub _prog_ref_html {
#------------------
    # Returns a block of HTML listing the BLAST literature references and
    # program description links, followed by a credit line whose link is
    # built from $BioWWW->home_url('bioperl').
    my $references_html = <<"QQ_REF_QQ";
<p>
<small>
<b>References:</b>
<ol>
<li>Altschul, Stephen F., Warren Gish, Webb Miller, Eugene W. Myers, and David J. Lipman (1990).
Basic local alignment search tool.
<a href="http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?uid=2231712&form=6&db=m&Dopt=r">J. Mol. Biol. 215: 403-10</a>.
<li>Altschul et al. (1997), Gapped BLAST and PSI-BLAST:
a new generation of protein database search programs.
<a href="http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?uid=9254694&form=6&db=m&Dopt=r">Nucl. Acids Res. 25: 3389-3402</a>.
<li><b>Program Descriptions</b>:
<a href="http://www.ncbi.nlm.nih.gov/BLAST/newblast.html">BLAST2</a> |
<a href="http://blast.wustl.edu/">WU-BLAST2</a> |
<a href="http://www.ncbi.nlm.nih.gov/BLAST/blast_help.html">Help Manual</a>
</ol>
<small>
HTML formatting provided by the <a href="${\$BioWWW->home_url('bioperl')}Projects/Blast/">Bioperl Blast module</a>.
</small>
</small>
<p>

QQ_REF_QQ

    # Not really a reference for the Blast algorithm itself but an interesting usage.
    #<li>Gish, Warren, and David J. States (1993). Identification of protein coding regions by database similarity search.
    #<a href="http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?uid=8485583&form=6&db=m&Dopt=r">Nature Genetics 3:266-72</a>.

    return $references_html;
}
|
|
648
|
|
649
|
|
650 =head2 _genbank_alert
|
|
651
|
|
652 Usage : n/a; utility method used by get_html_func().
|
|
653 Purpose : Get a special alert for BLAST reports against all of GenBank/EMBL.
|
|
654 Returns : string with HTML
|
|
655
|
|
656 See Also : L<get_html_func()|get_html_func>
|
|
657
|
|
658 =cut
|
|
659
|
|
660 #------------------
|
|
sub _genbank_alert {
#------------------
    # Returns a fixed HTML caution paragraph about hits that may come from
    # DNA sequences containing more than one gene. The text contains nothing
    # to interpolate, so a non-interpolating here-document is used.
    my $alert_html = <<'QQ_GENBANK_QQ';
<p><b><font color="red">CAUTION: Hits reported on this page may be derived from DNA sequences
that contain more than one gene.
</font>To avoid mis-interpretation, always check database entries
for any sequence of interest to verify that the similarity
occurs within the described sequence. (E.g., A DNA sequence
for gene X as reported in GenBank may contain a 5' or 3'
fragment of coding sequence for a neighboring gene Y, yet will
be listed as gene X, since gene Y had not yet been identified). </b>
QQ_GENBANK_QQ

    return $alert_html;
}
|
|
674
|
|
675
|
|
676
|
|
677 =head2 strip_html
|
|
678
|
|
679 Usage : $boolean = &strip_html( string_ref );
|
|
680 : This method is exported.
|
|
681 Purpose : Removes HTML formatting from a supplied string.
|
|
682 : Attempts to restore the Blast report to enable
|
|
683 : parsing by Bio::Tools::Blast.pm.
|
|
684 Returns : Boolean: true if string was stripped, false if not.
|
|
685 Argument : string_ref = reference to a string containing the whole Blast
|
|
686 : report.
|
|
687 Throws : Croaks if the argument is not a scalar reference.
|
|
688 Comments : Based on code originally written by Alex Dong Li
|
|
689 : (ali@genet.sickkids.on.ca).
|
|
690 : This method does some Blast-specific stripping
|
|
691 : (adds back a '>' character in front of each HSP
|
|
692 : alignment listing).
|
|
693 :
|
|
694 : THIS METHOD IS HIGHLY ERROR-PRONE!
|
|
695 :
|
|
696 : Removal of the HTML tags and accurate reconstitution of the
|
|
697 : non-HTML-formatted report is highly dependent on structure of
|
|
698 : the HTML-formatted version. For example, it assumes that first
|
|
699 : line of each alignment section (HSP listing) starts with a
|
|
700 : <a name=..> anchor tag. This permits the reconstruction of the
|
|
701 : original report in which these lines begin with a ">".
|
|
702 : This is required for parsing.
|
|
703 :
|
|
704 : If the structure of the Blast report itself is not intended to
|
|
705 : be a standard, the structure of the HTML-formatted version
|
|
706 : is even less so. Therefore, the use of this method to
|
|
707 : reconstitute parsable Blast reports from HTML-format versions
|
|
708 : should be considered a temporary solution.
|
|
709
|
|
710 See Also : B<Bio::Tools::Blast::parse()>
|
|
711
|
|
712 =cut
|
|
713
|
|
714 #---------------
|
|
sub strip_html {
#---------------
    # Purpose  : Removes HTML formatting from a Blast report, in place.
    # Argument : Reference to a scalar holding the report text.
    # Returns  : True (1) if anything was removed or rewritten, else 0.
    # Throws   : Croaks if the argument is not a SCALAR reference.
    #
    # This may not be the best way to remove html tags. However, it is simple.
    # It won't work under the following conditions:
    # 1) if a quoted > appears in a tag (does this ever happen?)
    # 2) if a tag is split over multiple lines and this method is
    #    used to process one line at a time.

    my $string_ref = shift;

    ref $string_ref eq 'SCALAR' or
        croak ("Can't strip HTML: ".
               "Argument should be a SCALAR reference not a ${\ref $string_ref}");

    my $str      = $$string_ref;
    my $stripped = 0;

    # Removing "<a name =...>" and adding the '>' character for
    # HSP alignment listings.
    $str =~ s/(\A|\n)<a name ?=[^>]+> ?/>/sgi and $stripped = 1;

    # Removing all "<>" tags and non-breaking-space markup: the &nbsp entity
    # (with or without ';') and literal non-breaking space characters
    # (0xA0, optionally preceded by the 0xC2 byte of its UTF-8 encoding).
    # Ordinary spaces must be left alone, since they separate the fields of
    # the report.
    $str =~ s/<[^>]+>|&nbsp;?|\xC2?\xA0//sgi and $stripped = 1;

    # Re-uniting any lone '>' characters.
    $str =~ s/(\A|\n)>\s+/\n\n>/sgi and $stripped = 1;

    $$string_ref = $str;
    return $stripped;
}
|
|
745
|
|
746 1;
|
|
747 __END__
|
|
748
|
|
749 #####################################################################################
|
|
750 # END OF CLASS #
|
|
751 #####################################################################################
|
|
752
|
|
753
|
|
754
|