Mercurial > repos > mahtabm > ensemb_rep_gvl
comparison variant_effect_predictor/Bio/Search/SearchUtils.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:2bc9b66ada89 |
---|---|
1 =head1 NAME | |
2 | |
3 Bio::Search::SearchUtils - Utility functions for Bio::Search:: objects | |
4 | |
5 =head1 SYNOPSIS | |
6 | |
7 This module is just a collection of subroutines, not an object. | |
8 | |
9 =head1 DESCRIPTION | |
10 | |
11 The SearchUtils.pm module is a collection of subroutines used primarily by | |
12 Bio::Search::Hit::HitI objects for some of the additional | |
13 functionality, such as HSP tiling. Right now, the SearchUtils is just a | |
14 collection of methods, not an object. | |
15 | |
16 =head1 AUTHOR | |
17 | |
18 Steve Chervitz E<lt>sac@bioperl.orgE<gt> | |
19 | |
20 =cut | |
21 | |
22 #' | |
23 | |
24 package Bio::Search::SearchUtils; | |
25 | |
26 use strict; | |
27 use vars qw($DEBUG); | |
28 $DEBUG = 1; | |
29 | |
30 =head2 tile_hsps | |
31 | |
32 Usage : tile_hsps( $sbjct ); | |
33 : This is called automatically by methods in Bio::Search::Hit::GenericHit | |
34 : that rely on having tiled data. | |
35 Purpose : Collect statistics about the aligned sequences in a set of HSPs. | |
36 : Calculates the following data across all HSPs: | |
37 : -- total alignment length | |
38 : -- total identical residues | |
39 : -- total conserved residues | |
40 Returns : n/a | |
41 Argument : A Bio::Search::Hit::HitI object | |
42 Throws : n/a | |
43 Comments : | |
44 : This method performs more careful summing of data across | |
45 : all HSPs in the Sbjct object. Only HSPs that are in the same strand | |
46 : and frame are tiled. Simply summing the data from all HSPs | |
47 : in the same strand and frame will overestimate the actual | |
48 : length of the alignment if there is overlap between different HSPs | |
49 : (often the case). | |
50 : | |
51 : The strategy is to tile the HSPs and sum over the | |
52 : contigs, collecting data separately from overlapping and | |
53 : non-overlapping regions of each HSP. To facilitate this, the | |
54 : HSP.pm object now permits extraction of data from sub-sections | |
55 : of an HSP. | |
56 : | |
57 : Additional useful information is collected from the results | |
58 : of the tiling. It is possible that sub-sequences in | |
59 : different HSPs will overlap significantly. In this case, it | |
60 : is impossible to create a single unambiguous alignment by | |
61 : concatenating the HSPs. The ambiguity may indicate the | |
62 : presence of multiple, similar domains in one or both of the | |
63 : aligned sequences. This ambiguity is recorded using the | |
64 : ambiguous_aln() method. | |
65 : | |
66 : This method does not attempt to discern biologically | |
67 : significant vs. insignificant overlaps. The allowable amount of | |
68 : overlap can be set with the overlap() method or with the -OVERLAP | |
69 : parameter used when constructing the Hit object. | |
70 : | |
71 : For a given hit, both the query and the sbjct sequences are | |
72 : tiled independently. | |
73 : | |
74 : -- If only query sequence HSPs overlap, | |
75 : this may suggest multiple domains in the sbjct. | |
76 : -- If only sbjct sequence HSPs overlap, | |
77 : this may suggest multiple domains in the query. | |
78 : -- If both query & sbjct sequence HSPs overlap, | |
79 : this suggests multiple domains in both. | |
80 : -- If neither query & sbjct sequence HSPs overlap, | |
81 : this suggests either no multiple domains in either | |
82 : sequence OR that both sequences have the same | |
83 : distribution of multiple similar domains. | |
84 : | |
85 : This method can deal with the special case of when multiple | |
86 : HSPs exactly overlap. | |
87 : | |
88 : Efficiency concerns: | |
89 : Speed will be an issue for sequences with numerous HSPs. | |
90 : | |
91 Bugs : Currently, tile_hsps() does not properly account for | |
92 : the number of non-tiled but overlapping HSPs, which becomes a problem | |
93 : as overlap() grows. Large values overlap() may thus lead to | |
94 : incorrect statistics for some hits. For best results, keep overlap() | |
95 : below 5 (DEFAULT IS 2). For more about this, see the "HSP Tiling and | |
96 : Ambiguous Alignments" section in L<Bio::Search::Hit::GenericHit>. | |
97 | |
98 See Also : L<_adjust_contigs>(), L<Bio::Search::Hit::GenericHit|Bio::Search::Hit::GenericHit> | |
99 | |
100 =cut | |
101 | |
102 #-------------- | |
103 sub tile_hsps { | |
104 #-------------- | |
105 my $sbjct = shift; | |
106 | |
107 $sbjct->tiled_hsps(1); | |
108 $sbjct->gaps('query', 0); | |
109 $sbjct->gaps('hit', 0); | |
110 | |
111 ## Simple summation scheme. Valid if there is only one HSP. | |
112 if( $sbjct->n == 1 or $sbjct->num_hsps == 1) { | |
113 my $hsp = $sbjct->hsp; | |
114 $sbjct->length_aln('query', $hsp->length('query')); | |
115 $sbjct->length_aln('hit', $hsp->length('sbjct')); | |
116 $sbjct->length_aln('total', $hsp->length('total')); | |
117 $sbjct->matches( $hsp->matches() ); | |
118 $sbjct->gaps('query', $hsp->gaps('query')); | |
119 $sbjct->gaps('sbjct', $hsp->gaps('sbjct')); | |
120 | |
121 # print "_tile_hsps(): single HSP, easy stats.\n"; | |
122 return; | |
123 } else { | |
124 # print STDERR "Sbjct: _tile_hsps: summing multiple HSPs\n"; | |
125 $sbjct->length_aln('query', 0); | |
126 $sbjct->length_aln('sbjct', 0); | |
127 $sbjct->length_aln('total', 0); | |
128 $sbjct->matches( 0, 0); | |
129 } | |
130 | |
131 ## More than one HSP. Must tile HSPs. | |
132 # print "\nTiling HSPs for $sbjct\n"; | |
133 my($hsp, $qstart, $qstop, $sstart, $sstop); | |
134 my($frame, $strand, $qstrand, $sstrand); | |
135 my(@qcontigs, @scontigs); | |
136 my $qoverlap = 0; | |
137 my $soverlap = 0; | |
138 my $max_overlap = $sbjct->overlap; | |
139 my $hit_qgaps = 0; | |
140 my $hit_sgaps = 0; | |
141 my $hit_len_aln = 0; | |
142 my %start_stop; | |
143 | |
144 foreach $hsp ($sbjct->hsps()) { | |
145 # printf " HSP: %s\n%s\n",$hsp->name, $hsp->str('query'); | |
146 # printf " Length = %d; Identical = %d; Conserved = %d; Conserved(1-10): %d",$hsp->length, $hsp->length(-TYPE=>'iden'), $hsp->length(-TYPE=>'cons'), $hsp->length(-TYPE=>'cons',-START=>0,-STOP=>10); | |
147 ($qstart, $qstop) = $hsp->range('query'); | |
148 ($sstart, $sstop) = $hsp->range('sbjct'); | |
149 $frame = $hsp->frame; | |
150 $frame = -1 unless defined $frame; | |
151 ($qstrand, $sstrand) = ($hsp->query->strand, | |
152 $hsp->hit->strand); | |
153 | |
154 # Note: No correction for overlap. | |
155 my ($qgaps, $sgaps) = ($hsp->gaps('query'), $hsp->gaps('hit')); | |
156 $hit_qgaps += $qgaps; | |
157 $hit_sgaps += $sgaps; | |
158 $hit_len_aln += $hsp->length; | |
159 | |
160 ## Collect contigs in the query sequence. | |
161 $qoverlap = &_adjust_contigs('query', $hsp, $qstart, $qstop, | |
162 \@qcontigs, $max_overlap, $frame, | |
163 $qstrand); | |
164 | |
165 ## Collect contigs in the sbjct sequence (needed for domain data and gapped Blast). | |
166 $soverlap = &_adjust_contigs('sbjct', $hsp, $sstart, $sstop, | |
167 \@scontigs, $max_overlap, $frame, | |
168 $sstrand); | |
169 | |
170 ## Collect overall start and stop data for query and sbjct over all HSPs. | |
171 if(not defined $start_stop{'qstart'}) { | |
172 $start_stop{'qstart'} = $qstart; | |
173 $start_stop{'qstop'} = $qstop; | |
174 $start_stop{'sstart'} = $sstart; | |
175 $start_stop{'sstop'} = $sstop; | |
176 } else { | |
177 $start_stop{'qstart'} = ($qstart < $start_stop{'qstart'} ? | |
178 $qstart : $start_stop{'qstart'} ); | |
179 $start_stop{'qstop'} = ($qstop > $start_stop{'qstop'} ? | |
180 $qstop : $start_stop{'qstop'} ); | |
181 $start_stop{'sstart'} = ($sstart < $start_stop{'sstart'} ? | |
182 $sstart : $start_stop{'sstart'} ); | |
183 $start_stop{'sstop'} = ($sstop > $start_stop{'sstop'} ? | |
184 $sstop : $start_stop{'sstop'} ); | |
185 } | |
186 } | |
187 | |
188 # Store the collected data in the Hit object | |
189 $sbjct->gaps('query', $hit_qgaps); | |
190 $sbjct->gaps('hit', $hit_sgaps); | |
191 $sbjct->length_aln('total', $hit_len_aln); | |
192 | |
193 $sbjct->start('query',$start_stop{'qstart'}); | |
194 $sbjct->end('query', $start_stop{'qstop'}); | |
195 $sbjct->start('hit', $start_stop{'sstart'}); | |
196 $sbjct->end('hit', $start_stop{'sstop'}); | |
197 | |
198 ## Collect data across the collected contigs. | |
199 | |
200 # print "\nQUERY CONTIGS:\n"; | |
201 # print " gaps = $sbjct->{'_gaps_query'}\n"; | |
202 | |
203 # Account for strand/frame. | |
204 # Strategy: collect data on a per strand+frame basis and save the most significant one. | |
205 my (%qctg_dat); | |
206 foreach(@qcontigs) { | |
207 # print " query contig: $_->{'start'} - $_->{'stop'}\n"; | |
208 # print " iden = $_->{'iden'}; cons = $_->{'cons'}\n"; | |
209 ($frame, $strand) = ($_->{'frame'}, $_->{'strand'}); | |
210 $qctg_dat{ "$frame$strand" }->{'length_aln_query'} += $_->{'stop'} - $_->{'start'} + 1; | |
211 $qctg_dat{ "$frame$strand" }->{'totalIdentical'} += $_->{'iden'}; | |
212 $qctg_dat{ "$frame$strand" }->{'totalConserved'} += $_->{'cons'}; | |
213 $qctg_dat{ "$frame$strand" }->{'qstrand'} = $strand; | |
214 } | |
215 | |
216 # Find longest contig. | |
217 my @sortedkeys = reverse sort { $qctg_dat{ $a }->{'length_aln_query'} <=> $qctg_dat{ $b }->{'length_aln_query'} } keys %qctg_dat; | |
218 | |
219 # Save the largest to the sbjct: | |
220 my $longest = $sortedkeys[0]; | |
221 $sbjct->length_aln('query', $qctg_dat{ $longest }->{'length_aln_query'}); | |
222 $sbjct->matches($qctg_dat{ $longest }->{'totalIdentical'}, | |
223 $qctg_dat{ $longest }->{'totalConserved'}); | |
224 $sbjct->strand('query', $qctg_dat{ $longest }->{'qstrand'}); | |
225 | |
226 ## Collect data for sbjct contigs. Important for gapped Blast. | |
227 ## The totalIdentical and totalConserved numbers will be the same | |
228 ## as determined for the query contigs. | |
229 | |
230 # print "\nSBJCT CONTIGS:\n"; | |
231 # print " gaps = ", $sbjct->gaps('sbjct'), "\n"; | |
232 | |
233 my (%sctg_dat); | |
234 foreach(@scontigs) { | |
235 # print " sbjct contig: $_->{'start'} - $_->{'stop'}\n"; | |
236 # print " iden = $_->{'iden'}; cons = $_->{'cons'}\n"; | |
237 ($frame, $strand) = ($_->{'frame'}, $_->{'strand'}); | |
238 $sctg_dat{ "$frame$strand" }->{'length_aln_sbjct'} += $_->{'stop'} - $_->{'start'} + 1; | |
239 $sctg_dat{ "$frame$strand" }->{'frame'} = $frame; | |
240 $sctg_dat{ "$frame$strand" }->{'sstrand'} = $strand; | |
241 } | |
242 | |
243 @sortedkeys = reverse sort { $sctg_dat{ $a }->{'length_aln_sbjct'} <=> $sctg_dat{ $b }->{'length_aln_sbjct'} } keys %sctg_dat; | |
244 | |
245 # Save the largest to the sbjct: | |
246 $longest = $sortedkeys[0]; | |
247 | |
248 $sbjct->length_aln('sbjct', $sctg_dat{ $longest }->{'length_aln_sbjct'}); | |
249 $sbjct->frame( $sctg_dat{ $longest }->{'frame'} ); | |
250 $sbjct->strand('hit', $sctg_dat{ $longest }->{'sstrand'}); | |
251 | |
252 if($qoverlap) { | |
253 if($soverlap) { $sbjct->ambiguous_aln('qs'); | |
254 # print "\n*** AMBIGUOUS ALIGNMENT: Query and Sbjct\n\n"; | |
255 } | |
256 else { $sbjct->ambiguous_aln('q'); | |
257 # print "\n*** AMBIGUOUS ALIGNMENT: Query\n\n"; | |
258 } | |
259 } elsif($soverlap) { | |
260 $sbjct->ambiguous_aln('s'); | |
261 # print "\n*** AMBIGUOUS ALIGNMENT: Sbjct\n\n"; | |
262 } | |
263 | |
264 # Adjust length based on BLAST flavor. | |
265 my $prog = $sbjct->algorithm; | |
266 if($prog eq 'TBLASTN') { | |
267 $sbjct->length_aln('sbjct', $sbjct->length_aln('sbjct')/3); | |
268 } elsif($prog eq 'BLASTX' ) { | |
269 $sbjct->length_aln('query', $sbjct->length_aln('query')/3); | |
270 } elsif($prog eq 'TBLASTX') { | |
271 $sbjct->length_aln('query', $sbjct->length_aln('query')/3); | |
272 $sbjct->length_aln('sbjct', $sbjct->length_aln('sbjct')/3); | |
273 } | |
274 } | |
275 | |
276 | |
277 | |
278 =head2 _adjust_contigs | |
279 | |
280 Usage : n/a; called automatically during object construction. | |
281 Purpose : Builds HSP contigs for a given BLAST hit. | |
282 : Utility method called by _tile_hsps() | |
283 Returns : | |
284 Argument : | |
285 Throws : Exceptions propagated from Bio::Search::Hit::BlastHSP::matches() | |
286 : for invalid sub-sequence ranges. | |
287 Status : Experimental | |
288 Comments : This method does not currently support gapped alignments. | |
289 : Also, it does not keep track of the number of HSPs that | |
290 : overlap within the amount specified by overlap(). | |
291 : This will lead to significant tracking errors for large | |
292 : overlap values. | |
293 | |
294 See Also : L<tile_hsps>(), L<Bio::Search::Hit::BlastHSP::matches|Bio::Search::Hit::BlastHSP> | |
295 | |
296 =cut | |
297 | |
298 #------------------- | |
299 sub _adjust_contigs { | |
300 #------------------- | |
301 my ($seqType, $hsp, $start, $stop, $contigs_ref, | |
302 $max_overlap, $frame, $strand) = @_; | |
303 | |
304 my $overlap = 0; | |
305 my ($numID, $numCons); | |
306 | |
307 # printf STDERR "Testing $seqType data: HSP (%s); $start, $stop, strand=$strand, frame=$frame\n", $hsp->$seqType()->seq_id if $DEBUG; | |
308 | |
309 foreach ( @$contigs_ref) { | |
310 # print STDERR " Contig: $_->{'start'} - $_->{'stop'}, strand=$_->{'strand'}, frame=$_->{'frame'}, iden= $_->{'iden'}, cons= $_->{'cons'}\n" if $DEBUG; | |
311 # Don't merge things unless they have matching strand/frame. | |
312 next unless ($_->{'frame'} == $frame and $_->{'strand'} == $strand); | |
313 | |
314 ## Test special case of a nested HSP. Skip it. | |
315 if($start >= $_->{'start'} and $stop <= $_->{'stop'}) { | |
316 # print STDERR "----> Nested HSP. Skipping.\n"; | |
317 $overlap = 1; | |
318 next; | |
319 } | |
320 | |
321 ## Test for overlap at beginning of contig. | |
322 if($start < $_->{'start'} and $stop > ($_->{'start'} + $max_overlap)) { | |
323 # print STDERR "----> Overlaps beg: existing beg,end: $_->{'start'},$_->{'stop'}, new beg,end: $start,$stop\n"; | |
324 # Collect stats over the non-overlapping region. | |
325 eval { | |
326 ($numID, $numCons) = $hsp->matches(-SEQ =>$seqType, | |
327 -START =>$start, | |
328 -STOP =>$_->{'start'}-1); | |
329 }; | |
330 if($@) { warn "\a\n$@\n"; } | |
331 else { | |
332 $_->{'start'} = $start; # Assign a new start coordinate to the contig | |
333 $_->{'iden'} += $numID; # and add new data to #identical, #conserved. | |
334 $_->{'cons'} += $numCons; | |
335 $overlap = 1; | |
336 } | |
337 } | |
338 | |
339 ## Test for overlap at end of contig. | |
340 if($stop > $_->{'stop'} and | |
341 $start < ($_->{'stop'} - $max_overlap)) { | |
342 # print STDERR "----> Overlaps end: existing beg,end: $_->{'start'},$_->{'stop'}, new beg,end: $start,$stop\n"; | |
343 # Collect stats over the non-overlapping region. | |
344 eval { | |
345 ($numID,$numCons) = $hsp->matches(-SEQ =>$seqType, | |
346 -START =>$_->{'stop'}, | |
347 -STOP =>$stop); | |
348 }; | |
349 if($@) { warn "\a\n$@\n"; } | |
350 else { | |
351 $_->{'stop'} = $stop; # Assign a new stop coordinate to the contig | |
352 $_->{'iden'} += $numID; # and add new data to #identical, #conserved. | |
353 $_->{'cons'} += $numCons; | |
354 $overlap = 1; | |
355 } | |
356 } | |
357 $overlap && do { | |
358 # print STDERR " New Contig data:\n"; | |
359 # print STDERR " Contig: $_->{'start'} - $_->{'stop'}, iden= $_->{'iden'}, cons= $_->{'cons'}\n"; | |
360 last; | |
361 }; | |
362 } | |
363 ## If there is no overlap, add the complete HSP data. | |
364 !$overlap && do { | |
365 # print STDERR "No overlap. Adding new contig.\n"; | |
366 ($numID,$numCons) = $hsp->matches(-SEQ=>$seqType); | |
367 push @$contigs_ref, {'start'=>$start, 'stop'=>$stop, | |
368 'iden'=>$numID, 'cons'=>$numCons, | |
369 'strand'=>$strand, 'frame'=>$frame}; | |
370 }; | |
371 $overlap; | |
372 } | |
373 | |
374 =head2 get_exponent | |
375 | |
376 Usage : &get_exponent( number ); | |
377 Purpose : Determines the power of 10 exponent of an integer, float, | |
378 : or scientific notation number. | |
379 Example : &get_exponent("4.0e-206"); | |
380 : &get_exponent("0.00032"); | |
381 : &get_exponent("10."); | |
382 : &get_exponent("1000.0"); | |
383 : &get_exponent("e+83"); | |
384 Argument : Float, Integer, or scientific notation number | |
385 Returns : Integer representing the exponent part of the number (+ or -). | |
386 : If argument == 0 (zero), return value is "-999". | |
387 Comments : Exponents are rounded up (less negative) if the mantissa is >= 5. | |
388 : Exponents are rounded down (more negative) if the mantissa is <= -5. | |
389 | |
390 =cut | |
391 | |
392 #------------------ | |
393 sub get_exponent { | |
394 #------------------ | |
395 my $data = shift; | |
396 | |
397 my($num, $exp) = split /[eE]/, $data; | |
398 | |
399 if( defined $exp) { | |
400 $num = 1 if not $num; | |
401 $num >= 5 and $exp++; | |
402 $num <= -5 and $exp--; | |
403 } elsif( $num == 0) { | |
404 $exp = -999; | |
405 } elsif( not $num =~ /\./) { | |
406 $exp = CORE::length($num) -1; | |
407 } else { | |
408 $exp = 0; | |
409 $num .= '0' if $num =~ /\.$/; | |
410 my ($c); | |
411 my $rev = 0; | |
412 if($num !~ /^0/) { | |
413 $num = reverse($num); | |
414 $rev = 1; | |
415 } | |
416 do { $c = chop($num); | |
417 $c == 0 && $exp++; | |
418 } while( $c ne '.'); | |
419 | |
420 $exp = -$exp if $num == 0 and not $rev; | |
421 $exp -= 1 if $rev; | |
422 } | |
423 return $exp; | |
424 } | |
425 | |
426 =head2 collapse_nums | |
427 | |
428 Usage : @cnums = collapse_nums( @numbers ); | |
429 Purpose : Collapses a list of numbers into a set of ranges of consecutive terms: | |
430 : Useful for condensing long lists of consecutive numbers. | |
431 : EXPANDED: | |
432 : 1 2 3 4 5 6 10 12 13 14 15 17 18 20 21 22 24 26 30 31 32 | |
433 : COLLAPSED: | |
434 : 1-6 10 12-15 17 18 20-22 24 26 30-32 | |
435 Argument : List of numbers sorted numerically. | |
436 Returns : List of numbers mixed with ranges of numbers (see above). | |
437 Throws : n/a | |
438 | |
439 See Also : L<Bio::Search::Hit::BlastHit::seq_inds()|Bio::Search::Hit::BlastHit> | |
440 | |
441 =cut | |
442 | |
443 #------------------ | |
444 sub collapse_nums { | |
445 #------------------ | |
446 # This is probably not the slickest connectivity algorithm, but will do for now. | |
447 my @a = @_; | |
448 my ($from, $to, $i, @ca, $consec); | |
449 | |
450 $consec = 0; | |
451 for($i=0; $i < @a; $i++) { | |
452 not $from and do{ $from = $a[$i]; next; }; | |
453 if($a[$i] == $a[$i-1]+1) { | |
454 $to = $a[$i]; | |
455 $consec++; | |
456 } else { | |
457 if($consec == 1) { $from .= ",$to"; } | |
458 else { $from .= $consec>1 ? "\-$to" : ""; } | |
459 push @ca, split(',', $from); | |
460 $from = $a[$i]; | |
461 $consec = 0; | |
462 $to = undef; | |
463 } | |
464 } | |
465 if(defined $to) { | |
466 if($consec == 1) { $from .= ",$to"; } | |
467 else { $from .= $consec>1 ? "\-$to" : ""; } | |
468 } | |
469 push @ca, split(',', $from) if $from; | |
470 | |
471 @ca; | |
472 } | |
473 | |
474 | |
475 =head2 strip_blast_html | |
476 | |
477 Usage : $boolean = &strip_blast_html( string_ref ); | |
478 : This method is exported. | |
479 Purpose : Removes HTML formatting from a supplied string. | |
480 : Attempts to restore the Blast report to enable | |
481 : parsing by Bio::SearchIO::blast.pm | |
482 Returns : Boolean: true if string was stripped, false if not. | |
483 Argument : string_ref = reference to a string containing the whole Blast | |
484 : report containing HTML formatting. | |
485 Throws : Croaks if the argument is not a scalar reference. | |
486 Comments : Based on code originally written by Alex Dong Li | |
487 : (ali@genet.sickkids.on.ca). | |
488 : This method does some Blast-specific stripping | |
489 : (adds back a '>' character in front of each HSP | |
490 : alignment listing). | |
491 : | |
492 : THIS METHOD IS VERY SENSITIVE TO BLAST FORMATTING CHANGES! | |
493 : | |
494 : Removal of the HTML tags and accurate reconstitution of the | |
495 : non-HTML-formatted report is highly dependent on structure of | |
496 : the HTML-formatted version. For example, it assumes that first | |
497 : line of each alignment section (HSP listing) starts with a | |
498 : <a name=..> anchor tag. This permits the reconstruction of the | |
499 : original report in which these lines begin with a ">". | |
500 : This is required for parsing. | |
501 : | |
502 : If the structure of the Blast report itself is not intended to | |
503 : be a standard, the structure of the HTML-formatted version | |
504 : is even less so. Therefore, the use of this method to | |
505 : reconstitute parsable Blast reports from HTML-format versions | |
506 : should be considered a temorary solution. | |
507 | |
508 See Also : B<Bio::Search::Processor::BlastIO::new()> | |
509 | |
510 =cut | |
511 | |
512 #-------------------- | |
513 sub strip_blast_html { | |
514 #-------------------- | |
515 # This may not best way to remove html tags. However, it is simple. | |
516 # it won't work under following conditions: | |
517 # 1) if quoted > appears in a tag (does this ever happen?) | |
518 # 2) if a tag is split over multiple lines and this method is | |
519 # used to process one line at a time. | |
520 | |
521 my ($string_ref) = shift; | |
522 | |
523 ref $string_ref eq 'SCALAR' or | |
524 croak ("Can't strip HTML: ". | |
525 "Argument is should be a SCALAR reference not a ${\ref $string_ref}\n"); | |
526 | |
527 my $str = $$string_ref; | |
528 my $stripped = 0; | |
529 | |
530 # Removing "<a name =...>" and adding the '>' character for | |
531 # HSP alignment listings. | |
532 $str =~ s/(\A|\n)<a name ?=[^>]+> ?/>/sgi and $stripped = 1; | |
533 | |
534 # Removing all "<>" tags. | |
535 $str =~ s/<[^>]+>| //sgi and $stripped = 1; | |
536 | |
537 # Re-uniting any lone '>' characters. | |
538 $str =~ s/(\A|\n)>\s+/\n\n>/sgi and $stripped = 1; | |
539 | |
540 $$string_ref = $str; | |
541 $stripped; | |
542 } | |
543 | |
544 | |
545 1; | |
546 | |
547 |