comparison variant_effect_predictor/Bio/Assembly/ContigAnalysis.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 # $Id: ContigAnalysis.pm,v 1.2 2002/12/01 00:03:28 jason Exp $
2 #
3 # BioPerl module for Bio::Assembly::ContigAnalysis
4 #
5 # Cared for by Robson francisco de Souza <rfsouza@citri.iq.usp.br>
6 #
7 # Copyright Robson Francisco de Souza
8 #
9 # You may distribute this module under the same terms as perl itself
10 #
11 # POD documentation - main docs before the code
12
13 =head1 NAME
14
15 Bio::Assembly::ContigAnalysis -
16 Perform analysis on sequence assembly contigs.
17
18 =head1 SYNOPSIS
19
20 # Module loading
21 use Bio::Assembly::ContigAnalysis;
22
23 # Assembly loading methods
24 my $ca = new Bio::Assembly::ContigAnalysis( -contig=>$contigOBJ );
25
26 my @lcq = $ca->low_consensus_quality;
27 my @hqd = $ca->high_quality_discrepancies;
28 my @ss = $ca->single_strand_regions;
29
30 =head1 DESCRIPTION
31
32 A contig is as a set of sequences, locally aligned to each other, when
33 the sequences in a pair may be aligned. It may also include a
34 consensus sequence. Bio::Assembly::ContigAnalysis is a module
35 holding a collection of methods to analyze contig objects. It was
36 developed around the Bio::Assembly::Contig implementation of contigs and
37 can not work with another contig interface.
38
39 =head1 FEEDBACK
40
41 =head2 Mailing Lists
42
43 User feedback is an integral part of the evolution of this and other
44 Bioperl modules. Send your comments and suggestions preferably to the
45 Bioperl mailing lists Your participation is much appreciated.
46
47 bioperl-l@bioperl.org - General discussion
48 http://bio.perl.org/MailList.html - About the mailing lists
49
50 =head2 Reporting Bugs
51
52 Report bugs to the Bioperl bug tracking system to help us keep track
53 the bugs and their resolution. Bug reports can be submitted via email
54 or the web:
55
56 bioperl-bugs@bio.perl.org
57 http://bugzilla.bioperl.org/
58
59 =head1 AUTHOR - Robson Francisco de Souza
60
61 Email: rfsouza@citri.iq.usp.br
62
63 =head1 APPENDIX
64
65 The rest of the documentation details each of the object
66 methods. Internal methods are usually preceded with a _
67
68 =cut
69
70 package Bio::Assembly::ContigAnalysis;
71
72 use Bio::Root::Root;
73 use strict;
74 use vars qw(@ISA);
75
76 @ISA = qw(Bio::Root::Root);
77
78 =head1 Object creator
79
80 =head2 new
81
82 Title : new
83 Usage : my $contig = Bio::Assembly::ContigAnalysis->new(-contig=>$contigOBJ);
84 Function : Creates a new contig analysis object
85 Returns : Bio::Assembly::ContigAnalysis
86 Args :
87 -contig : a Bio::Assembly::Contig object
88
89 =cut
90
91 sub new {
92 my($class,@args) = @_;
93 my $self = $class->SUPER::new(@args);
94
95 my ($contigOBJ) = $self->_rearrange([qw(CONTIG)],@args);
96 unless ($contigOBJ->isa("Bio::Assembly::Contig")) {
97 $self->throw("ContigAnal works only on Bio::Assembly::Contig objects\n");
98 }
99
100 $self->{'_objref'} = $contigOBJ;
101 return $self;
102 }
103
104 =head1 Analysis methods
105
106 =head2 high_quality_discrepancies
107
108 Title : high_quality_discrepancies
109 Usage : my $sfc = $ContigAnal->high_quality_discrepancies();
110 Function :
111
112 Locates all high quality discrepancies among aligned
113 sequences and the consensus sequence.
114
115 Note: see Bio::Assembly::Contig POD documentation,
116 section "Coordinate System", for a definition of
117 available types. Default coordinate system type is
118 "gapped consensus", i.e. consensus sequence (with gaps)
119 coordinates. If limits are not specified, the entire
120 alignment is analyzed.
121
122 Returns : Bio::SeqFeature::Collection
123 Args : optional arguments are
124 -threshold : cutoff value for low quality (minimum high quality)
125 Default: 40
126 -ignore : number of bases that will not be analysed at
127 both ends of contig aligned elements
128 Default: 5
129 -start : start of interval that will be analyzed
130 -end : start of interval that will be analyzed
131 -type : coordinate system type for interval
132
133 =cut
134
135 sub high_quality_discrepancies {
136 my ($self,@args) = shift; # Package reference
137
138 my ($threshold,$ignore,$start,$end,$type) =
139 $self->_rearrange([qw(THRESHOLD IGNORE START END TYPE)],@args);
140
141 # Defining default threhold and HQD_ignore
142 $threshold = 40 unless (defined($threshold));
143 $ignore = 5 unless (defined($ignore));
144 $type = 'gapped consensus' unless (defined($type));
145
146 # Changing input coordinates system (if needed)
147 if (defined $start && $type ne 'gapped consensus') {
148 $start = $self->{'_objref'}->change_coord($type,'gapped consensus',$start);
149 } elsif (!defined $start) {
150 $start = 1;
151 }
152 if (defined $end && $type ne 'gapped consensus') {
153 $end = $self->{'_objref'}->change_coord($type,'gapped consensus',$end);
154 } elsif (!defined $end) {
155 $end = $self->{'_objref'}->get_consensus_length();
156 }
157
158 # Scanning each read sequence and the contig sequence and
159 # adding discrepancies to Bio::SeqFeature::Collection
160 my @seqIDs = $self->{'_objref'}->get_seq_ids(-start=>$start,
161 -end=>$end,
162 -type=>$type);
163 my $consensus = $self->{'_objref'}->get_consensus_sequence()->seq;
164
165 my @HQD = ();
166 foreach my $seqID (@seqIDs) {
167 # Setting aligned read sub-sequence limits and loading data
168 my $seq = $self->{'_objref'}->get_seq_by_name($seqID);
169 my $qual = $self->{'_objref'}->get_qual_by_name($seqID);
170 unless (defined $qual) {
171 $self->warn("Can't correctly evaluate HQD without aligned sequence qualities for $seqID");
172 next;
173 }
174 my $sequence = $seq->seq;
175 my @quality = @{ $qual->qual };
176
177 # Scanning the aligned region of each read
178 my $seq_ix = 0;
179 my $coord = $self->{'_objref'}->get_seq_feat_by_tag($seq,"_align_clipping:$seqID");
180 my ($astart,$aend) = ($coord->start,$coord->end);
181 $astart = $astart + $ignore; # Redefining limits to evaluate HQDs (jump $ignore at start)
182 $aend = $aend - $ignore; # Redefining limits to evaluate HQDs (stop $ignore before end)
183
184 my ($d_start,$d_end,$i);
185 for ($i=$astart-1; $i<=$aend-1; $i++) {
186 # Changing coordinate $i+1 from 'gapped consensus' mode to "aligned $seqID" (coordinate $seq_ix)
187 $seq_ix = $self->{'_objref'}->change_coord('gapped consensus',"aligned $seqID",$i+1);
188 next unless (($i >= $start) && ($i <= $end));
189
190 my $r_base = uc(substr($sequence,$seq_ix-1,1));
191 my $c_base = uc(substr($consensus,$i,1));
192
193 # Discrepant region start: store $d_start and $type
194 (!defined($d_start) &&
195 ($r_base ne $c_base) &&
196 ($quality[$seq_ix-1] >= $threshold)) && do {
197 $d_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i+1);
198 #print $seqID," ",$r_base," ",$i+1," ",$c_base," ",$contig_ix-1," ",$quality[$i]," $type\n";
199 next;
200 };
201
202 # Quality change or end of discrepant region: store limits and undef $d_start
203 if (defined($d_start) &&
204 (($quality[$seq_ix-1] < $threshold) ||
205 (uc($r_base) eq uc($c_base)))) {
206 $d_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i);
207 #print $seqID," ",$r_base," ",$i+1," ",$c_base," ",$contig_ix-1," ",$quality[$i]," $type\n";
208 push(@HQD, Bio::SeqFeature::Generic->new(-primary=>"high_quality_discrepancy:$seqID",
209 -start=>$d_start,
210 -end=>$d_end,
211 -strand=>$seq->strand()) );
212 $d_start = undef;
213 next;
214 }
215 } # for ($i=$astart-1; $i<=$aend-1; $i++)
216
217 # Loading discrepancies located at sub-sequence end, if any.
218 if (defined($d_start)) {
219 $d_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i);
220 push(@HQD, Bio::SeqFeature::Generic->new(-primary=>"high_quality_discrepancy:$seqID",
221 -start=>$d_start,
222 -end=>$d_end,
223 -strand=>$seq->strand()) );
224 }
225 } # foreach my $seqID (@seqIDs)
226
227 return @HQD;
228 }
229
230 =head2 low_consensus_quality
231
232 Title : low_consensus_quality
233 Usage : my $sfc = $ContigAnal->low_consensus_quality();
234 Function : Locates all low quality regions in the consensus
235 Returns : an array of Bio::SeqFeature::Generic objects
236 Args : optional arguments are
237 -threshold : cutoff value for low quality (minimum high quality)
238 Default: 25
239 -start : start of interval that will be analyzed
240 -end : start of interval that will be analyzed
241 -type : coordinate system type for interval
242
243 =cut
244
245 sub low_consensus_quality {
246 my ($self,@args) = shift; # Packege reference
247
248 my ($threshold,$start,$end,$type) =
249 $self->_rearrange([qw(THRESHOLD START END TYPE)],@args);
250
251 # Setting default value for threshold
252 $threshold = 25 unless (defined($threshold));
253
254 # Loading qualities
255 my @quality = @{ $self->{'_objref'}->get_consensus_quality()->qual };
256
257 # Changing coordinates to gap mode noaln (consed: consensus without alignments)
258 $start = 1 unless (defined($start));
259 if (defined $start && defined $type && ($type ne 'gapped consensus')) {
260 $start = $self->{'objref'}->change_coord($type,'gapped consensus',$start);
261 $end = $self->{'objref'}->change_coord($type,'gapped consensus',$end) if (defined($end));
262 }
263 $end = $self->{'_objref'}->get_consensus_length unless (defined $end);
264
265 # Scanning @quality vector and storing intervals limits with base qualities less then
266 # the threshold value
267 my ($lcq_start);
268 my ($i,@LCQ);
269 for ($i=$start-1; $i<=$end-1; $i++) {
270 # print $quality[$i],"\t",$i,"\n";
271 if (!defined($lcq_start) && (($quality[$i] <= $threshold) || ($quality[$i] == 98))) {
272 $lcq_start = $i+1;
273 } elsif (defined($lcq_start) && ($quality[$i] > $threshold)) {
274 $lcq_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$lcq_start);
275 my $lcq_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i);
276 push(@LCQ, Bio::SeqFeature::Generic->new(-start=>$lcq_start,
277 -end=>$lcq_end,
278 -primary=>'low_consensus_quality') );
279 $lcq_start = undef;
280 }
281 }
282
283 if (defined $lcq_start) {
284 $lcq_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$lcq_start);
285 my $lcq_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i);
286 push(@LCQ, Bio::SeqFeature::Generic->new(-start=>$lcq_start,
287 -end=>$lcq_end,
288 -primary=>'low_consensus_quality') );
289 }
290
291 return @LCQ;
292 }
293
294 =head2 not_confirmed_on_both_strands
295
296 Title : low_quality_consensus
297 Usage : my $sfc = $ContigAnal->low_quality_consensus();
298 Function :
299
300 Locates all regions whose consensus bases were not
301 confirmed by bases from sequences aligned in both
302 orientations, i.e., in such regions, no bases in aligned
303 sequences of either +1 or -1 strand agree with the
304 consensus bases.
305
306 Returns : an array of Bio::SeqFeature::Generic objects
307 Args : optional arguments are
308 -start : start of interval that will be analyzed
309 -end : start of interval that will be analyzed
310 -type : coordinate system type for interval
311
312 =cut
313
314 sub not_confirmed_on_both_strands {
315 my ($self,@args) = shift; # Package reference
316
317 my ($start,$end,$type) =
318 $self->_rearrange([qw(START END TYPE)],@args);
319
320 # Changing coordinates to default system 'align' (contig sequence with alignments)
321 $start = 1 unless (defined($start));
322 if (defined($type) && ($type ne 'gapped consensus')) {
323 $start = $self->{'_objref'}->change_coord($type,'gapped consensus',$start);
324 $end = $self->{'_objref'}->change_coord($type,'gapped consensus',$end) if (defined($end));
325 }
326 $end = $self->{'_objref'}->get_consensus_length unless (defined($end));
327
328 # Scanning alignment
329 my %confirmed = (); # If ($confirmed{$orientation}[$i] > 0) then $i is confirmed in $orientation strand
330 my ($i);
331 my $consensus = $self->{'_objref'}->get_consensus_sequence()->seq;
332 foreach my $seqID ($self->{'_objref'}->get_seq_ids) {
333 # Setting aligned read sub-sequence limits and loading data
334 my $seq = $self->{'_objref'}->get_seq_by_name($seqID);
335 my $sequence = $seq->seq;
336
337 # Scanning the aligned regions of each read and registering confirmed sites
338 my $contig_ix = 0;
339 my $coord = $self->{'_objref'}->get_seq_feat_by_tag($seq,"_align_clipping:$seqID");
340 my ($astart,$aend,$orientation) = ($coord->start,$coord->end,$coord->strand);
341 $astart = $self->{'_objref'}->change_coord('gapped consensus',"aligned $seqID",$astart);
342 $aend = $self->{'_objref'}->change_coord('gapped consensus',"aligned $seqID",$aend);
343 for ($i=$astart-1; $i<=$aend-1; $i++) {
344 # $i+1 in 'align' mode is $contig_ix
345 $contig_ix = $self->{'_objref'}->change_coord("aligned $seqID",'gapped consensus',$i+1);
346 next unless (($contig_ix >= $start) && ($contig_ix <= $end));
347 my $r_base = uc(substr($sequence,$i,1));
348 my $c_base = uc(substr($consensus,$contig_ix-1,1));
349 if ($c_base eq '-') {
350 $confirmed{$orientation}[$contig_ix] = -1;
351 } elsif (uc($r_base) eq uc($c_base)) { # Non discrepant region found
352 $confirmed{$orientation}[$contig_ix]++;
353 }
354 } # for ($i=$astart-1; $i<=$aend-1; $i++)
355 } # foreach $seqID (@reads)
356
357 # Locating non-confirmed aligned regions for each orientation in $confirmed registry
358 my ($orientation);
359 my @NCBS = ();
360 foreach $orientation (keys %confirmed) {
361 my ($ncbs_start,$ncbs_end);
362
363 for ($i=$start; $i<=$end; $i++) {
364 if (!defined($ncbs_start) &&
365 (!defined($confirmed{$orientation}[$i]) || ($confirmed{$orientation}[$i] == 0))) {
366 $ncbs_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i);
367 } elsif (defined($ncbs_start) &&
368 defined($confirmed{$orientation}[$i]) &&
369 ($confirmed{$orientation}[$i] > 0)) {
370 $ncbs_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i-1);
371 push(@NCBS, Bio::SeqFeature::Generic->new(-start=>$ncbs_start,
372 -end=>$ncbs_end,
373 -strand=>$orientation,
374 -primary=>"not_confirmed_on_both_strands") );
375 $ncbs_start = undef;
376 }
377 }
378
379 if (defined($ncbs_start)) { # NCBS at the end of contig
380 $ncbs_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$end);
381 push(@NCBS, Bio::SeqFeature::Generic->new(-start=>$ncbs_start,
382 -end=>$ncbs_end,
383 -strand=>$orientation,
384 -primary=>'not_confirmed_on_both_strands') );
385 }
386 }
387
388 return @NCBS;
389 }
390
391 =head2 single_strand
392
393 Title : single_strand
394 Usage : my $sfc = $ContigAnal->single_strand();
395 Function :
396
397 Locates all regions covered by aligned sequences only in
398 one of the two strands, i.e., regions for which aligned
399 sequence's strand() method returns +1 or -1 for all
400 sequences.
401
402 Returns : an array of Bio::SeqFeature::Generic objects
403 Args : optional arguments are
404 -start : start of interval that will be analyzed
405 -end : start of interval that will be analyzed
406 -type : coordinate system type for interval
407
408 =cut
409
410 #'
411 sub single_strand {
412 my ($self,@args) = shift; # Package reference
413
414 my ($start,$end,$type) =
415 $self->_rearrange([qw(START END TYPE)],@args);
416
417 # Changing coordinates to gap mode align (consed: consensus sequence with alignments)
418 $type = 'gapped consensus' unless(defined($type));
419 $start = 1 unless (defined($start));
420 if (defined($type) && $type ne 'gapped consensus') {
421 $start = $self->{'objref'}->change_coord($type,'gapped consensus',$start);
422 $end = $self->{'objref'}->change_coord($type,'gapped consensus',$end) if (defined($end));
423 }
424 ($end) = $self->{'_objref'}->get_consensus_length unless (defined($end));
425
426 # Loading complete list of coordinates for aligned sequences
427 my $sfc = $self->{'_objref'}->get_features_collection();
428 my @forward = grep { $_->primary_tag =~ /^_aligned_coord:/ }
429 $sfc->features_in_range(-start=>$start,
430 -end=>$end,
431 -contain=>0,
432 -strand=>1,
433 -strandmatch=>'strong');
434 my @reverse = grep { $_->primary_tag =~ /^_aligned_coord:/ }
435 $sfc->features_in_range(-start=>$start,
436 -end=>$end,
437 -contain=>0,
438 -strand=>-1,
439 -strandmatch=>'strong');
440 # Merging overlapping features
441 @forward = $self->_merge_overlapping_features(@forward);
442 @reverse = $self->_merge_overlapping_features(@reverse);
443
444 # Finding single stranded regions
445 my ($length) = $self->{'_objref'}->get_consensus_length;
446 $length = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$length);
447 @forward = $self->_complementary_features_list(1,$length,@forward);
448 @reverse = $self->_complementary_features_list(1,$length,@reverse);
449
450 my @SS = ();
451 foreach my $feat (@forward, @reverse) {
452 $feat->primary_tag('single_strand_region');
453 push(@SS,$feat);
454 }
455
456 return @SS;
457 }
458
459 =head1 Internal Methods
460
461 =head2 _merge_overlapping_features
462
463 Title : _merge_overlapping_features
464 Usage : my @feat = $ContigAnal->_merge_overlapping_features(@features);
465 Function : Merge all overlapping features into features
466 that hold original features as sub-features
467 Returns : array of Bio::SeqFeature::Generic objects
468 Args : array of Bio::SeqFeature::Generic objects
469
470 =cut
471
472 sub _merge_overlapping_features {
473 my ($self,@feat) = @_;
474
475 $self->throw_not_implemented();
476 }
477
478 =head2 _complementary_features_list
479
480 Title : _complementary_features_list
481 Usage : @feat = $ContigAnal->_complementary_features_list($start,$end,@features);
482 Function : Build a list of features for regions
483 not covered by features in @features array
484 Returns : array of Bio::SeqFeature::Generic objects
485 Args :
486 $start : [integer] start of first output feature
487 $end : [integer] end of last output feature
488 @features : array of Bio::SeqFeature::Generic objects
489
490 =cut
491
492 sub _complementary_features_list {
493 my ($self,$start,$end,@feat) = @_;
494
495 $self->throw_not_implemented();
496 }
497
498 1;
499
500 __END__