0
|
1 =head1 NAME
|
|
2
|
|
3 Bio::Search::BlastUtils - Utility functions for Bio::Search:: BLAST objects
|
|
4
|
|
5 =head1 SYNOPSIS
|
|
6
|
|
7 This module is just a collection of subroutines, not an object.
|
|
8
|
|
9 =head1 DESCRIPTION
|
|
10
|
|
11 The BlastUtils.pm module is a collection of subroutines used primarily by
|
|
12 Bio::Search::Hit::BlastHit objects for some of the additional
|
|
13 functionality, such as HSP tiling. Right now, the BlastUtils is just a
|
|
14 collection of methods, not an object, and it's tightly coupled to
|
|
15 Bio::Search::Hit::BlastHit. A goal for the future is to generalize it
|
|
16 to work based on the Bio::Search interfaces, then it can work with any
|
|
17 objects that implements them.
|
|
18
|
|
19 =head1 AUTHOR
|
|
20
|
|
21 Steve Chervitz E<lt>sac@bioperl.orgE<gt>
|
|
22
|
|
23 =cut
|
|
24
|
|
25 #'
|
|
26
|
|
27 package Bio::Search::BlastUtils;
|
|
28
|
|
29
|
|
30
|
|
31 =head2 tile_hsps
|
|
32
|
|
33 Usage : tile_hsps( $sbjct );
|
|
34 : This is called automatically by Bio::Search::Hit::BlastHit
|
|
35 : during object construction or
|
|
36 : as needed by methods that rely on having tiled data.
|
|
37 Purpose : Collect statistics about the aligned sequences in a set of HSPs.
|
|
38 : Calculates the following data across all HSPs:
|
|
39 : -- total alignment length
|
|
40 : -- total identical residues
|
|
41 : -- total conserved residues
|
|
42 Returns : n/a
|
|
43 Argument : A Bio::Search::Hit::BlastHit object
|
|
44 Throws : n/a
|
|
45 Comments :
|
|
46 : This method is *strongly* coupled to Bio::Search::Hit::BlastHit
|
|
47 : (it accesses BlastHit data members directly).
|
|
48 : TODO: Re-write this to the Bio::Search::Hit::HitI interface.
|
|
49 :
|
|
50 : This method performs more careful summing of data across
|
|
51 : all HSPs in the Sbjct object. Only HSPs that are in the same strand
|
|
52 : and frame are tiled. Simply summing the data from all HSPs
|
|
53 : in the same strand and frame will overestimate the actual
|
|
54 : length of the alignment if there is overlap between different HSPs
|
|
55 : (often the case).
|
|
56 :
|
|
57 : The strategy is to tile the HSPs and sum over the
|
|
58 : contigs, collecting data separately from overlapping and
|
|
59 : non-overlapping regions of each HSP. To facilitate this, the
|
|
60 : HSP.pm object now permits extraction of data from sub-sections
|
|
61 : of an HSP.
|
|
62 :
|
|
63 : Additional useful information is collected from the results
|
|
64 : of the tiling. It is possible that sub-sequences in
|
|
65 : different HSPs will overlap significantly. In this case, it
|
|
66 : is impossible to create a single unambiguous alignment by
|
|
67 : concatenating the HSPs. The ambiguity may indicate the
|
|
68 : presence of multiple, similar domains in one or both of the
|
|
69 : aligned sequences. This ambiguity is recorded using the
|
|
70 : ambiguous_aln() method.
|
|
71 :
|
|
72 : This method does not attempt to discern biologically
|
|
73 : significant vs. insignificant overlaps. The allowable amount of
|
|
74 : overlap can be set with the overlap() method or with the -OVERLAP
|
|
75 : parameter used when constructing the Blast & Sbjct objects.
|
|
76 :
|
|
77 : For a given hit, both the query and the sbjct sequences are
|
|
78 : tiled independently.
|
|
79 :
|
|
80 : -- If only query sequence HSPs overlap,
|
|
81 : this may suggest multiple domains in the sbjct.
|
|
82 : -- If only sbjct sequence HSPs overlap,
|
|
83 : this may suggest multiple domains in the query.
|
|
84 : -- If both query & sbjct sequence HSPs overlap,
|
|
85 : this suggests multiple domains in both.
|
|
86 : -- If neither query & sbjct sequence HSPs overlap,
|
|
87 : this suggests either no multiple domains in either
|
|
88 : sequence OR that both sequences have the same
|
|
89 : distribution of multiple similar domains.
|
|
90 :
|
|
91 : This method can deal with the special case of when multiple
|
|
92 : HSPs exactly overlap.
|
|
93 :
|
|
94 : Efficiency concerns:
|
|
95 : Speed will be an issue for sequences with numerous HSPs.
|
|
96 :
|
|
97 Bugs : Currently, tile_hsps() does not properly account for
|
|
98 : the number of non-tiled but overlapping HSPs, which becomes a problem
|
|
99 : as overlap() grows. Large values overlap() may thus lead to
|
|
100 : incorrect statistics for some hits. For best results, keep overlap()
|
|
101 : below 5 (DEFAULT IS 2). For more about this, see the "HSP Tiling and
|
|
102 : Ambiguous Alignments" section in L<Bio::Search::Hit::BlastHit>.
|
|
103
|
|
104 See Also : L<_adjust_contigs>(), L<Bio::Search::Hit::BlastHit|Bio::Search::Hit::BlastHit>
|
|
105
|
|
106 =cut
|
|
107
|
|
108 #--------------
|
|
109 sub tile_hsps {
|
|
110 #--------------
|
|
111 my $sbjct = shift;
|
|
112
|
|
113 $sbjct->{'_tile_hsps'} = 1;
|
|
114 $sbjct->{'_gaps_query'} = 0;
|
|
115 $sbjct->{'_gaps_sbjct'} = 0;
|
|
116
|
|
117 ## Simple summation scheme. Valid if there is only one HSP.
|
|
118 if((defined($sbjct->{'_n'}) and $sbjct->{'_n'} == 1) or $sbjct->num_hsps == 1) {
|
|
119 my $hsp = $sbjct->hsp;
|
|
120 $sbjct->{'_length_aln_query'} = $hsp->length('query');
|
|
121 $sbjct->{'_length_aln_sbjct'} = $hsp->length('sbjct');
|
|
122 $sbjct->{'_length_aln_total'} = $hsp->length('total');
|
|
123 ($sbjct->{'_totalIdentical'},$sbjct->{'_totalConserved'}) = $hsp->matches();
|
|
124 $sbjct->{'_gaps_query'} = $hsp->gaps('query');
|
|
125 $sbjct->{'_gaps_sbjct'} = $hsp->gaps('sbjct');
|
|
126
|
|
127 # print "_tile_hsps(): single HSP, easy stats.\n";
|
|
128 return;
|
|
129 } else {
|
|
130 # print STDERR "Sbjct: _tile_hsps: summing multiple HSPs\n";
|
|
131 $sbjct->{'_length_aln_query'} = 0;
|
|
132 $sbjct->{'_length_aln_sbjct'} = 0;
|
|
133 $sbjct->{'_length_aln_total'} = 0;
|
|
134 $sbjct->{'_totalIdentical'} = 0;
|
|
135 $sbjct->{'_totalConserved'} = 0;
|
|
136 }
|
|
137
|
|
138 ## More than one HSP. Must tile HSPs.
|
|
139 # print "\nTiling HSPs for $sbjct\n";
|
|
140 my($hsp, $qstart, $qstop, $sstart, $sstop);
|
|
141 my($frame, $strand, $qstrand, $sstrand);
|
|
142 my(@qcontigs, @scontigs);
|
|
143 my $qoverlap = 0;
|
|
144 my $soverlap = 0;
|
|
145 my $max_overlap = $sbjct->{'_overlap'};
|
|
146
|
|
147 foreach $hsp ($sbjct->hsps()) {
|
|
148 # printf " HSP: %s\n%s\n",$hsp->name, $hsp->str('query');
|
|
149 # printf " Length = %d; Identical = %d; Conserved = %d; Conserved(1-10): %d",$hsp->length, $hsp->length(-TYPE=>'iden'), $hsp->length(-TYPE=>'cons'), $hsp->length(-TYPE=>'cons',-START=>0,-STOP=>10);
|
|
150 ($qstart, $qstop) = $hsp->range('query');
|
|
151 ($sstart, $sstop) = $hsp->range('sbjct');
|
|
152 $frame = $hsp->frame;
|
|
153 $frame = -1 unless defined $frame;
|
|
154 ($qstrand, $sstrand) = $hsp->strand;
|
|
155
|
|
156 my ($qgaps, $sgaps) = $hsp->gaps();
|
|
157 $sbjct->{'_gaps_query'} += $qgaps;
|
|
158 $sbjct->{'_gaps_sbjct'} += $sgaps;
|
|
159
|
|
160 $sbjct->{'_length_aln_total'} += $hsp->length;
|
|
161 ## Collect contigs in the query sequence.
|
|
162 $qoverlap = &_adjust_contigs('query', $hsp, $qstart, $qstop, \@qcontigs, $max_overlap, $frame, $qstrand);
|
|
163
|
|
164 ## Collect contigs in the sbjct sequence (needed for domain data and gapped Blast).
|
|
165 $soverlap = &_adjust_contigs('sbjct', $hsp, $sstart, $sstop, \@scontigs, $max_overlap, $frame, $sstrand);
|
|
166
|
|
167 ## Collect overall start and stop data for query and sbjct over all HSPs.
|
|
168 if(not defined $sbjct->{'_queryStart'}) {
|
|
169 $sbjct->{'_queryStart'} = $qstart;
|
|
170 $sbjct->{'_queryStop'} = $qstop;
|
|
171 $sbjct->{'_sbjctStart'} = $sstart;
|
|
172 $sbjct->{'_sbjctStop'} = $sstop;
|
|
173 } else {
|
|
174 $sbjct->{'_queryStart'} = ($qstart < $sbjct->{'_queryStart'} ? $qstart : $sbjct->{'_queryStart'});
|
|
175 $sbjct->{'_queryStop'} = ($qstop > $sbjct->{'_queryStop'} ? $qstop : $sbjct->{'_queryStop'});
|
|
176 $sbjct->{'_sbjctStart'} = ($sstart < $sbjct->{'_sbjctStart'} ? $sstart : $sbjct->{'_sbjctStart'});
|
|
177 $sbjct->{'_sbjctStop'} = ($sstop > $sbjct->{'_sbjctStop'} ? $sstop : $sbjct->{'_sbjctStop'});
|
|
178 }
|
|
179 }
|
|
180
|
|
181 ## Collect data across the collected contigs.
|
|
182
|
|
183 # print "\nQUERY CONTIGS:\n";
|
|
184 # print " gaps = $sbjct->{'_gaps_query'}\n";
|
|
185
|
|
186 # TODO: Account for strand/frame issue!
|
|
187 # Strategy: collect data on a per strand+frame basis and save the most significant one.
|
|
188 my (%qctg_dat);
|
|
189 foreach(@qcontigs) {
|
|
190 # print " query contig: $_->{'start'} - $_->{'stop'}\n";
|
|
191 # print " iden = $_->{'iden'}; cons = $_->{'cons'}\n";
|
|
192 ($frame, $strand) = ($_->{'frame'}, $_->{'strand'});
|
|
193 $qctg_dat{ "$frame$strand" }->{'length_aln_query'} += $_->{'stop'} - $_->{'start'} + 1;
|
|
194 $qctg_dat{ "$frame$strand" }->{'totalIdentical'} += $_->{'iden'};
|
|
195 $qctg_dat{ "$frame$strand" }->{'totalConserved'} += $_->{'cons'};
|
|
196 $qctg_dat{ "$frame$strand" }->{'qstrand'} = $strand;
|
|
197 }
|
|
198
|
|
199 # Find longest contig.
|
|
200 my @sortedkeys = reverse sort { $qctg_dat{ $a }->{'length_aln_query'} <=> $qctg_dat{ $b }->{'length_aln_query'} } keys %qctg_dat;
|
|
201
|
|
202 # Save the largest to the sbjct:
|
|
203 my $longest = $sortedkeys[0];
|
|
204 $sbjct->{'_length_aln_query'} = $qctg_dat{ $longest }->{'length_aln_query'};
|
|
205 $sbjct->{'_totalIdentical'} = $qctg_dat{ $longest }->{'totalIdentical'};
|
|
206 $sbjct->{'_totalConserved'} = $qctg_dat{ $longest }->{'totalConserved'};
|
|
207 $sbjct->{'_qstrand'} = $qctg_dat{ $longest }->{'qstrand'};
|
|
208
|
|
209 ## Collect data for sbjct contigs. Important for gapped Blast.
|
|
210 ## The totalIdentical and totalConserved numbers will be the same
|
|
211 ## as determined for the query contigs.
|
|
212
|
|
213 # print "\nSBJCT CONTIGS:\n";
|
|
214 # print " gaps = $sbjct->{'_gaps_sbjct'}\n";
|
|
215
|
|
216 my (%sctg_dat);
|
|
217 foreach(@scontigs) {
|
|
218 # print " sbjct contig: $_->{'start'} - $_->{'stop'}\n";
|
|
219 # print " iden = $_->{'iden'}; cons = $_->{'cons'}\n";
|
|
220 ($frame, $strand) = ($_->{'frame'}, $_->{'strand'});
|
|
221 $sctg_dat{ "$frame$strand" }->{'length_aln_sbjct'} += $_->{'stop'} - $_->{'start'} + 1;
|
|
222 $sctg_dat{ "$frame$strand" }->{'frame'} = $frame;
|
|
223 $sctg_dat{ "$frame$strand" }->{'sstrand'} = $strand;
|
|
224 }
|
|
225
|
|
226 @sortedkeys = reverse sort { $sctg_dat{ $a }->{'length_aln_sbjct'} <=> $sctg_dat{ $b }->{'length_aln_sbjct'} } keys %sctg_dat;
|
|
227
|
|
228 # Save the largest to the sbjct:
|
|
229 $longest = $sortedkeys[0];
|
|
230
|
|
231 $sbjct->{'_length_aln_sbjct'} = $sctg_dat{ $longest }->{'length_aln_sbjct'};
|
|
232 $sbjct->{'_frame'} = $sctg_dat{ $longest }->{'frame'};
|
|
233 $sbjct->{'_sstrand'} = $sctg_dat{ $longest }->{'sstrand'};
|
|
234
|
|
235 if($qoverlap) {
|
|
236 if($soverlap) { $sbjct->ambiguous_aln('qs');
|
|
237 # print "\n*** AMBIGUOUS ALIGNMENT: Query and Sbjct\n\n";
|
|
238 }
|
|
239 else { $sbjct->ambiguous_aln('q');
|
|
240 # print "\n*** AMBIGUOUS ALIGNMENT: Query\n\n";
|
|
241 }
|
|
242 } elsif($soverlap) {
|
|
243 $sbjct->ambiguous_aln('s');
|
|
244 # print "\n*** AMBIGUOUS ALIGNMENT: Sbjct\n\n";
|
|
245 }
|
|
246
|
|
247 # Adjust length based on BLAST flavor.
|
|
248 my $prog = $sbjct->algorithm;
|
|
249 if($prog eq 'TBLASTN') {
|
|
250 $sbjct->{'_length_aln_sbjct'} /= 3;
|
|
251 } elsif($prog eq 'BLASTX' ) {
|
|
252 $sbjct->{'_length_aln_query'} /= 3;
|
|
253 } elsif($prog eq 'TBLASTX') {
|
|
254 $sbjct->{'_length_aln_query'} /= 3;
|
|
255 $sbjct->{'_length_aln_sbjct'} /= 3;
|
|
256 }
|
|
257 }
|
|
258
|
|
259
|
|
260
|
|
261 =head2 _adjust_contigs
|
|
262
|
|
263 Usage : n/a; called automatically during object construction.
|
|
264 Purpose : Builds HSP contigs for a given BLAST hit.
|
|
265 : Utility method called by _tile_hsps()
|
|
266 Returns :
|
|
267 Argument :
|
|
268 Throws : Exceptions propagated from Bio::Search::Hit::BlastHSP::matches()
|
|
269 : for invalid sub-sequence ranges.
|
|
270 Status : Experimental
|
|
271 Comments : This method does not currently support gapped alignments.
|
|
272 : Also, it does not keep track of the number of HSPs that
|
|
273 : overlap within the amount specified by overlap().
|
|
274 : This will lead to significant tracking errors for large
|
|
275 : overlap values.
|
|
276
|
|
277 See Also : L<tile_hsps>(), L<Bio::Search::Hit::BlastHSP::matches|Bio::Search::Hit::BlastHSP>
|
|
278
|
|
279 =cut
|
|
280
|
|
281 #-------------------
|
|
282 sub _adjust_contigs {
|
|
283 #-------------------
|
|
284 my ($seqType, $hsp, $start, $stop, $contigs_ref, $max_overlap, $frame, $strand) = @_;
|
|
285
|
|
286 my $overlap = 0;
|
|
287 my ($numID, $numCons);
|
|
288
|
|
289 # print STDERR "Testing $seqType data: HSP (${\$hsp->name}); $start, $stop, strand=$strand, frame=$frame\n";
|
|
290 foreach(@$contigs_ref) {
|
|
291 # print STDERR " Contig: $_->{'start'} - $_->{'stop'}, strand=$_->{'strand'}, frame=$_->{'frame'}, iden= $_->{'iden'}, cons= $_->{'cons'}\n";
|
|
292
|
|
293 # Don't merge things unless they have matching strand/frame.
|
|
294 next unless ($_->{'frame'} == $frame and $_->{'strand'} == $strand);
|
|
295
|
|
296 ## Test special case of a nested HSP. Skip it.
|
|
297 if($start >= $_->{'start'} and $stop <= $_->{'stop'}) {
|
|
298 # print STDERR "----> Nested HSP. Skipping.\n";
|
|
299 $overlap = 1;
|
|
300 next;
|
|
301 }
|
|
302
|
|
303 ## Test for overlap at beginning of contig.
|
|
304 if($start < $_->{'start'} and $stop > ($_->{'start'} + $max_overlap)) {
|
|
305 # print STDERR "----> Overlaps beg: existing beg,end: $_->{'start'},$_->{'stop'}, new beg,end: $start,$stop\n";
|
|
306 # Collect stats over the non-overlapping region.
|
|
307 eval {
|
|
308 ($numID, $numCons) = $hsp->matches(-SEQ =>$seqType,
|
|
309 -START =>$start,
|
|
310 -STOP =>$_->{'start'}-1);
|
|
311 };
|
|
312 if($@) { warn "\a\n$@\n"; }
|
|
313 else {
|
|
314 $_->{'start'} = $start; # Assign a new start coordinate to the contig
|
|
315 $_->{'iden'} += $numID; # and add new data to #identical, #conserved.
|
|
316 $_->{'cons'} += $numCons;
|
|
317 $overlap = 1;
|
|
318 }
|
|
319 }
|
|
320
|
|
321 ## Test for overlap at end of contig.
|
|
322 if($stop > $_->{'stop'} and $start < ($_->{'stop'} - $max_overlap)) {
|
|
323 # print STDERR "----> Overlaps end: existing beg,end: $_->{'start'},$_->{'stop'}, new beg,end: $start,$stop\n";
|
|
324 # Collect stats over the non-overlapping region.
|
|
325 eval {
|
|
326 ($numID,$numCons) = $hsp->matches(-SEQ =>$seqType,
|
|
327 -START =>$_->{'stop'},
|
|
328 -STOP =>$stop);
|
|
329 };
|
|
330 if($@) { warn "\a\n$@\n"; }
|
|
331 else {
|
|
332 $_->{'stop'} = $stop; # Assign a new stop coordinate to the contig
|
|
333 $_->{'iden'} += $numID; # and add new data to #identical, #conserved.
|
|
334 $_->{'cons'} += $numCons;
|
|
335 $overlap = 1;
|
|
336 }
|
|
337 }
|
|
338 $overlap && do {
|
|
339 # print STDERR " New Contig data:\n";
|
|
340 # print STDERR " Contig: $_->{'start'} - $_->{'stop'}, iden= $_->{'iden'}, cons= $_->{'cons'}\n";
|
|
341 last;
|
|
342 };
|
|
343 }
|
|
344 ## If there is no overlap, add the complete HSP data.
|
|
345 !$overlap && do {
|
|
346 # print STDERR "No overlap. Adding new contig.\n";
|
|
347 ($numID,$numCons) = $hsp->matches(-SEQ=>$seqType);
|
|
348 push @$contigs_ref, {'start'=>$start, 'stop'=>$stop,
|
|
349 'iden'=>$numID, 'cons'=>$numCons,
|
|
350 'strand'=>$strand, 'frame'=>$frame};
|
|
351 };
|
|
352 $overlap;
|
|
353 }
|
|
354
|
|
355 =head2 get_exponent
|
|
356
|
|
357 Usage : &get_exponent( number );
|
|
358 Purpose : Determines the power of 10 exponent of an integer, float,
|
|
359 : or scientific notation number.
|
|
360 Example : &get_exponent("4.0e-206");
|
|
361 : &get_exponent("0.00032");
|
|
362 : &get_exponent("10.");
|
|
363 : &get_exponent("1000.0");
|
|
364 : &get_exponent("e+83");
|
|
365 Argument : Float, Integer, or scientific notation number
|
|
366 Returns : Integer representing the exponent part of the number (+ or -).
|
|
367 : If argument == 0 (zero), return value is "-999".
|
|
368 Comments : Exponents are rounded up (less negative) if the mantissa is >= 5.
|
|
369 : Exponents are rounded down (more negative) if the mantissa is <= -5.
|
|
370
|
|
371 =cut
|
|
372
|
|
373 #------------------
|
|
374 sub get_exponent {
|
|
375 #------------------
|
|
376 my $data = shift;
|
|
377
|
|
378 my($num, $exp) = split /[eE]/, $data;
|
|
379
|
|
380 if( defined $exp) {
|
|
381 $num = 1 if not $num;
|
|
382 $num >= 5 and $exp++;
|
|
383 $num <= -5 and $exp--;
|
|
384 } elsif( $num == 0) {
|
|
385 $exp = -999;
|
|
386 } elsif( not $num =~ /\./) {
|
|
387 $exp = CORE::length($num) -1;
|
|
388 } else {
|
|
389 $exp = 0;
|
|
390 $num .= '0' if $num =~ /\.$/;
|
|
391 my ($c);
|
|
392 my $rev = 0;
|
|
393 if($num !~ /^0/) {
|
|
394 $num = reverse($num);
|
|
395 $rev = 1;
|
|
396 }
|
|
397 do { $c = chop($num);
|
|
398 $c == 0 && $exp++;
|
|
399 } while( $c ne '.');
|
|
400
|
|
401 $exp = -$exp if $num == 0 and not $rev;
|
|
402 $exp -= 1 if $rev;
|
|
403 }
|
|
404 return $exp;
|
|
405 }
|
|
406
|
|
407 =head2 collapse_nums
|
|
408
|
|
409 Usage : @cnums = collapse_nums( @numbers );
|
|
410 Purpose : Collapses a list of numbers into a set of ranges of consecutive terms:
|
|
411 : Useful for condensing long lists of consecutive numbers.
|
|
412 : EXPANDED:
|
|
413 : 1 2 3 4 5 6 10 12 13 14 15 17 18 20 21 22 24 26 30 31 32
|
|
414 : COLLAPSED:
|
|
415 : 1-6 10 12-15 17 18 20-22 24 26 30-32
|
|
416 Argument : List of numbers sorted numerically.
|
|
417 Returns : List of numbers mixed with ranges of numbers (see above).
|
|
418 Throws : n/a
|
|
419
|
|
420 See Also : L<Bio::Search::Hit::BlastHit::seq_inds()|Bio::Search::Hit::BlastHit>
|
|
421
|
|
422 =cut
|
|
423
|
|
424 #------------------
|
|
425 sub collapse_nums {
|
|
426 #------------------
|
|
427 # This is probably not the slickest connectivity algorithm, but will do for now.
|
|
428 my @a = @_;
|
|
429 my ($from, $to, $i, @ca, $consec);
|
|
430
|
|
431 $consec = 0;
|
|
432 for($i=0; $i < @a; $i++) {
|
|
433 not $from and do{ $from = $a[$i]; next; };
|
|
434 if($a[$i] == $a[$i-1]+1) {
|
|
435 $to = $a[$i];
|
|
436 $consec++;
|
|
437 } else {
|
|
438 if($consec == 1) { $from .= ",$to"; }
|
|
439 else { $from .= $consec>1 ? "\-$to" : ""; }
|
|
440 push @ca, split(',', $from);
|
|
441 $from = $a[$i];
|
|
442 $consec = 0;
|
|
443 $to = undef;
|
|
444 }
|
|
445 }
|
|
446 if(defined $to) {
|
|
447 if($consec == 1) { $from .= ",$to"; }
|
|
448 else { $from .= $consec>1 ? "\-$to" : ""; }
|
|
449 }
|
|
450 push @ca, split(',', $from) if $from;
|
|
451
|
|
452 @ca;
|
|
453 }
|
|
454
|
|
455
|
|
456 =head2 strip_blast_html
|
|
457
|
|
458 Usage : $boolean = &strip_blast_html( string_ref );
|
|
459 : This method is exported.
|
|
460 Purpose : Removes HTML formatting from a supplied string.
|
|
461 : Attempts to restore the Blast report to enable
|
|
462 : parsing by Bio::SearchIO::blast.pm
|
|
463 Returns : Boolean: true if string was stripped, false if not.
|
|
464 Argument : string_ref = reference to a string containing the whole Blast
|
|
465 : report containing HTML formatting.
|
|
466 Throws : Croaks if the argument is not a scalar reference.
|
|
467 Comments : Based on code originally written by Alex Dong Li
|
|
468 : (ali@genet.sickkids.on.ca).
|
|
469 : This method does some Blast-specific stripping
|
|
470 : (adds back a '>' character in front of each HSP
|
|
471 : alignment listing).
|
|
472 :
|
|
473 : THIS METHOD IS VERY SENSITIVE TO BLAST FORMATTING CHANGES!
|
|
474 :
|
|
475 : Removal of the HTML tags and accurate reconstitution of the
|
|
476 : non-HTML-formatted report is highly dependent on structure of
|
|
477 : the HTML-formatted version. For example, it assumes that first
|
|
478 : line of each alignment section (HSP listing) starts with a
|
|
479 : <a name=..> anchor tag. This permits the reconstruction of the
|
|
480 : original report in which these lines begin with a ">".
|
|
481 : This is required for parsing.
|
|
482 :
|
|
483 : If the structure of the Blast report itself is not intended to
|
|
484 : be a standard, the structure of the HTML-formatted version
|
|
485 : is even less so. Therefore, the use of this method to
|
|
486 : reconstitute parsable Blast reports from HTML-format versions
|
|
487 : should be considered a temorary solution.
|
|
488
|
|
489 See Also : B<Bio::Search::Processor::BlastIO::new()>
|
|
490
|
|
491 =cut
|
|
492
|
|
493 #--------------------
|
|
494 sub strip_blast_html {
|
|
495 #--------------------
|
|
496 # This may not best way to remove html tags. However, it is simple.
|
|
497 # it won't work under following conditions:
|
|
498 # 1) if quoted > appears in a tag (does this ever happen?)
|
|
499 # 2) if a tag is split over multiple lines and this method is
|
|
500 # used to process one line at a time.
|
|
501
|
|
502 my ($string_ref) = shift;
|
|
503
|
|
504 ref $string_ref eq 'SCALAR' or
|
|
505 croak ("Can't strip HTML: ".
|
|
506 "Argument is should be a SCALAR reference not a ${\ref $string_ref}\n");
|
|
507
|
|
508 my $str = $$string_ref;
|
|
509 my $stripped = 0;
|
|
510
|
|
511 # Removing "<a name =...>" and adding the '>' character for
|
|
512 # HSP alignment listings.
|
|
513 $str =~ s/(\A|\n)<a name ?=[^>]+> ?/>/sgi and $stripped = 1;
|
|
514
|
|
515 # Removing all "<>" tags.
|
|
516 $str =~ s/<[^>]+>| //sgi and $stripped = 1;
|
|
517
|
|
518 # Re-uniting any lone '>' characters.
|
|
519 $str =~ s/(\A|\n)>\s+/\n\n>/sgi and $stripped = 1;
|
|
520
|
|
521 $$string_ref = $str;
|
|
522 $stripped;
|
|
523 }
|
|
524
|
|
525
|
|
526 1;
|
|
527
|
|
528
|