Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Assembly/ContigAnalysis.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: ContigAnalysis.pm,v 1.2 2002/12/01 00:03:28 jason Exp $ | |
2 # | |
3 # BioPerl module for Bio::Assembly::ContigAnalysis | |
4 # | |
5 # Cared for by Robson francisco de Souza <rfsouza@citri.iq.usp.br> | |
6 # | |
7 # Copyright Robson Francisco de Souza | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 # | |
11 # POD documentation - main docs before the code | |
12 | |
13 =head1 NAME | |
14 | |
15 Bio::Assembly::ContigAnalysis - | |
16 Perform analysis on sequence assembly contigs. | |
17 | |
18 =head1 SYNOPSIS | |
19 | |
20 # Module loading | |
21 use Bio::Assembly::ContigAnalysis; | |
22 | |
23 # Assembly loading methods | |
24 my $ca = new Bio::Assembly::ContigAnalysis( -contig=>$contigOBJ ); | |
25 | |
26 my @lcq = $ca->low_consensus_quality; | |
27 my @hqd = $ca->high_quality_discrepancies; | |
28 my @ss = $ca->single_strand_regions; | |
29 | |
30 =head1 DESCRIPTION | |
31 | |
32 A contig is as a set of sequences, locally aligned to each other, when | |
33 the sequences in a pair may be aligned. It may also include a | |
34 consensus sequence. Bio::Assembly::ContigAnalysis is a module | |
35 holding a collection of methods to analyze contig objects. It was | |
36 developed around the Bio::Assembly::Contig implementation of contigs and | |
37 can not work with another contig interface. | |
38 | |
39 =head1 FEEDBACK | |
40 | |
41 =head2 Mailing Lists | |
42 | |
43 User feedback is an integral part of the evolution of this and other | |
44 Bioperl modules. Send your comments and suggestions preferably to the | |
45 Bioperl mailing lists Your participation is much appreciated. | |
46 | |
47 bioperl-l@bioperl.org - General discussion | |
48 http://bio.perl.org/MailList.html - About the mailing lists | |
49 | |
50 =head2 Reporting Bugs | |
51 | |
52 Report bugs to the Bioperl bug tracking system to help us keep track | |
53 the bugs and their resolution. Bug reports can be submitted via email | |
54 or the web: | |
55 | |
56 bioperl-bugs@bio.perl.org | |
57 http://bugzilla.bioperl.org/ | |
58 | |
59 =head1 AUTHOR - Robson Francisco de Souza | |
60 | |
61 Email: rfsouza@citri.iq.usp.br | |
62 | |
63 =head1 APPENDIX | |
64 | |
65 The rest of the documentation details each of the object | |
66 methods. Internal methods are usually preceded with a _ | |
67 | |
68 =cut | |
69 | |
70 package Bio::Assembly::ContigAnalysis; | |
71 | |
72 use Bio::Root::Root; | |
73 use strict; | |
74 use vars qw(@ISA); | |
75 | |
76 @ISA = qw(Bio::Root::Root); | |
77 | |
78 =head1 Object creator | |
79 | |
80 =head2 new | |
81 | |
82 Title : new | |
83 Usage : my $contig = Bio::Assembly::ContigAnalysis->new(-contig=>$contigOBJ); | |
84 Function : Creates a new contig analysis object | |
85 Returns : Bio::Assembly::ContigAnalysis | |
86 Args : | |
87 -contig : a Bio::Assembly::Contig object | |
88 | |
89 =cut | |
90 | |
91 sub new { | |
92 my($class,@args) = @_; | |
93 my $self = $class->SUPER::new(@args); | |
94 | |
95 my ($contigOBJ) = $self->_rearrange([qw(CONTIG)],@args); | |
96 unless ($contigOBJ->isa("Bio::Assembly::Contig")) { | |
97 $self->throw("ContigAnal works only on Bio::Assembly::Contig objects\n"); | |
98 } | |
99 | |
100 $self->{'_objref'} = $contigOBJ; | |
101 return $self; | |
102 } | |
103 | |
104 =head1 Analysis methods | |
105 | |
106 =head2 high_quality_discrepancies | |
107 | |
108 Title : high_quality_discrepancies | |
109 Usage : my $sfc = $ContigAnal->high_quality_discrepancies(); | |
110 Function : | |
111 | |
112 Locates all high quality discrepancies among aligned | |
113 sequences and the consensus sequence. | |
114 | |
115 Note: see Bio::Assembly::Contig POD documentation, | |
116 section "Coordinate System", for a definition of | |
117 available types. Default coordinate system type is | |
118 "gapped consensus", i.e. consensus sequence (with gaps) | |
119 coordinates. If limits are not specified, the entire | |
120 alignment is analyzed. | |
121 | |
122 Returns : Bio::SeqFeature::Collection | |
123 Args : optional arguments are | |
124 -threshold : cutoff value for low quality (minimum high quality) | |
125 Default: 40 | |
126 -ignore : number of bases that will not be analysed at | |
127 both ends of contig aligned elements | |
128 Default: 5 | |
129 -start : start of interval that will be analyzed | |
130 -end : start of interval that will be analyzed | |
131 -type : coordinate system type for interval | |
132 | |
133 =cut | |
134 | |
135 sub high_quality_discrepancies { | |
136 my ($self,@args) = shift; # Package reference | |
137 | |
138 my ($threshold,$ignore,$start,$end,$type) = | |
139 $self->_rearrange([qw(THRESHOLD IGNORE START END TYPE)],@args); | |
140 | |
141 # Defining default threhold and HQD_ignore | |
142 $threshold = 40 unless (defined($threshold)); | |
143 $ignore = 5 unless (defined($ignore)); | |
144 $type = 'gapped consensus' unless (defined($type)); | |
145 | |
146 # Changing input coordinates system (if needed) | |
147 if (defined $start && $type ne 'gapped consensus') { | |
148 $start = $self->{'_objref'}->change_coord($type,'gapped consensus',$start); | |
149 } elsif (!defined $start) { | |
150 $start = 1; | |
151 } | |
152 if (defined $end && $type ne 'gapped consensus') { | |
153 $end = $self->{'_objref'}->change_coord($type,'gapped consensus',$end); | |
154 } elsif (!defined $end) { | |
155 $end = $self->{'_objref'}->get_consensus_length(); | |
156 } | |
157 | |
158 # Scanning each read sequence and the contig sequence and | |
159 # adding discrepancies to Bio::SeqFeature::Collection | |
160 my @seqIDs = $self->{'_objref'}->get_seq_ids(-start=>$start, | |
161 -end=>$end, | |
162 -type=>$type); | |
163 my $consensus = $self->{'_objref'}->get_consensus_sequence()->seq; | |
164 | |
165 my @HQD = (); | |
166 foreach my $seqID (@seqIDs) { | |
167 # Setting aligned read sub-sequence limits and loading data | |
168 my $seq = $self->{'_objref'}->get_seq_by_name($seqID); | |
169 my $qual = $self->{'_objref'}->get_qual_by_name($seqID); | |
170 unless (defined $qual) { | |
171 $self->warn("Can't correctly evaluate HQD without aligned sequence qualities for $seqID"); | |
172 next; | |
173 } | |
174 my $sequence = $seq->seq; | |
175 my @quality = @{ $qual->qual }; | |
176 | |
177 # Scanning the aligned region of each read | |
178 my $seq_ix = 0; | |
179 my $coord = $self->{'_objref'}->get_seq_feat_by_tag($seq,"_align_clipping:$seqID"); | |
180 my ($astart,$aend) = ($coord->start,$coord->end); | |
181 $astart = $astart + $ignore; # Redefining limits to evaluate HQDs (jump $ignore at start) | |
182 $aend = $aend - $ignore; # Redefining limits to evaluate HQDs (stop $ignore before end) | |
183 | |
184 my ($d_start,$d_end,$i); | |
185 for ($i=$astart-1; $i<=$aend-1; $i++) { | |
186 # Changing coordinate $i+1 from 'gapped consensus' mode to "aligned $seqID" (coordinate $seq_ix) | |
187 $seq_ix = $self->{'_objref'}->change_coord('gapped consensus',"aligned $seqID",$i+1); | |
188 next unless (($i >= $start) && ($i <= $end)); | |
189 | |
190 my $r_base = uc(substr($sequence,$seq_ix-1,1)); | |
191 my $c_base = uc(substr($consensus,$i,1)); | |
192 | |
193 # Discrepant region start: store $d_start and $type | |
194 (!defined($d_start) && | |
195 ($r_base ne $c_base) && | |
196 ($quality[$seq_ix-1] >= $threshold)) && do { | |
197 $d_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i+1); | |
198 #print $seqID," ",$r_base," ",$i+1," ",$c_base," ",$contig_ix-1," ",$quality[$i]," $type\n"; | |
199 next; | |
200 }; | |
201 | |
202 # Quality change or end of discrepant region: store limits and undef $d_start | |
203 if (defined($d_start) && | |
204 (($quality[$seq_ix-1] < $threshold) || | |
205 (uc($r_base) eq uc($c_base)))) { | |
206 $d_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i); | |
207 #print $seqID," ",$r_base," ",$i+1," ",$c_base," ",$contig_ix-1," ",$quality[$i]," $type\n"; | |
208 push(@HQD, Bio::SeqFeature::Generic->new(-primary=>"high_quality_discrepancy:$seqID", | |
209 -start=>$d_start, | |
210 -end=>$d_end, | |
211 -strand=>$seq->strand()) ); | |
212 $d_start = undef; | |
213 next; | |
214 } | |
215 } # for ($i=$astart-1; $i<=$aend-1; $i++) | |
216 | |
217 # Loading discrepancies located at sub-sequence end, if any. | |
218 if (defined($d_start)) { | |
219 $d_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i); | |
220 push(@HQD, Bio::SeqFeature::Generic->new(-primary=>"high_quality_discrepancy:$seqID", | |
221 -start=>$d_start, | |
222 -end=>$d_end, | |
223 -strand=>$seq->strand()) ); | |
224 } | |
225 } # foreach my $seqID (@seqIDs) | |
226 | |
227 return @HQD; | |
228 } | |
229 | |
230 =head2 low_consensus_quality | |
231 | |
232 Title : low_consensus_quality | |
233 Usage : my $sfc = $ContigAnal->low_consensus_quality(); | |
234 Function : Locates all low quality regions in the consensus | |
235 Returns : an array of Bio::SeqFeature::Generic objects | |
236 Args : optional arguments are | |
237 -threshold : cutoff value for low quality (minimum high quality) | |
238 Default: 25 | |
239 -start : start of interval that will be analyzed | |
240 -end : start of interval that will be analyzed | |
241 -type : coordinate system type for interval | |
242 | |
243 =cut | |
244 | |
245 sub low_consensus_quality { | |
246 my ($self,@args) = shift; # Packege reference | |
247 | |
248 my ($threshold,$start,$end,$type) = | |
249 $self->_rearrange([qw(THRESHOLD START END TYPE)],@args); | |
250 | |
251 # Setting default value for threshold | |
252 $threshold = 25 unless (defined($threshold)); | |
253 | |
254 # Loading qualities | |
255 my @quality = @{ $self->{'_objref'}->get_consensus_quality()->qual }; | |
256 | |
257 # Changing coordinates to gap mode noaln (consed: consensus without alignments) | |
258 $start = 1 unless (defined($start)); | |
259 if (defined $start && defined $type && ($type ne 'gapped consensus')) { | |
260 $start = $self->{'objref'}->change_coord($type,'gapped consensus',$start); | |
261 $end = $self->{'objref'}->change_coord($type,'gapped consensus',$end) if (defined($end)); | |
262 } | |
263 $end = $self->{'_objref'}->get_consensus_length unless (defined $end); | |
264 | |
265 # Scanning @quality vector and storing intervals limits with base qualities less then | |
266 # the threshold value | |
267 my ($lcq_start); | |
268 my ($i,@LCQ); | |
269 for ($i=$start-1; $i<=$end-1; $i++) { | |
270 # print $quality[$i],"\t",$i,"\n"; | |
271 if (!defined($lcq_start) && (($quality[$i] <= $threshold) || ($quality[$i] == 98))) { | |
272 $lcq_start = $i+1; | |
273 } elsif (defined($lcq_start) && ($quality[$i] > $threshold)) { | |
274 $lcq_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$lcq_start); | |
275 my $lcq_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i); | |
276 push(@LCQ, Bio::SeqFeature::Generic->new(-start=>$lcq_start, | |
277 -end=>$lcq_end, | |
278 -primary=>'low_consensus_quality') ); | |
279 $lcq_start = undef; | |
280 } | |
281 } | |
282 | |
283 if (defined $lcq_start) { | |
284 $lcq_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$lcq_start); | |
285 my $lcq_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i); | |
286 push(@LCQ, Bio::SeqFeature::Generic->new(-start=>$lcq_start, | |
287 -end=>$lcq_end, | |
288 -primary=>'low_consensus_quality') ); | |
289 } | |
290 | |
291 return @LCQ; | |
292 } | |
293 | |
294 =head2 not_confirmed_on_both_strands | |
295 | |
296 Title : low_quality_consensus | |
297 Usage : my $sfc = $ContigAnal->low_quality_consensus(); | |
298 Function : | |
299 | |
300 Locates all regions whose consensus bases were not | |
301 confirmed by bases from sequences aligned in both | |
302 orientations, i.e., in such regions, no bases in aligned | |
303 sequences of either +1 or -1 strand agree with the | |
304 consensus bases. | |
305 | |
306 Returns : an array of Bio::SeqFeature::Generic objects | |
307 Args : optional arguments are | |
308 -start : start of interval that will be analyzed | |
309 -end : start of interval that will be analyzed | |
310 -type : coordinate system type for interval | |
311 | |
312 =cut | |
313 | |
314 sub not_confirmed_on_both_strands { | |
315 my ($self,@args) = shift; # Package reference | |
316 | |
317 my ($start,$end,$type) = | |
318 $self->_rearrange([qw(START END TYPE)],@args); | |
319 | |
320 # Changing coordinates to default system 'align' (contig sequence with alignments) | |
321 $start = 1 unless (defined($start)); | |
322 if (defined($type) && ($type ne 'gapped consensus')) { | |
323 $start = $self->{'_objref'}->change_coord($type,'gapped consensus',$start); | |
324 $end = $self->{'_objref'}->change_coord($type,'gapped consensus',$end) if (defined($end)); | |
325 } | |
326 $end = $self->{'_objref'}->get_consensus_length unless (defined($end)); | |
327 | |
328 # Scanning alignment | |
329 my %confirmed = (); # If ($confirmed{$orientation}[$i] > 0) then $i is confirmed in $orientation strand | |
330 my ($i); | |
331 my $consensus = $self->{'_objref'}->get_consensus_sequence()->seq; | |
332 foreach my $seqID ($self->{'_objref'}->get_seq_ids) { | |
333 # Setting aligned read sub-sequence limits and loading data | |
334 my $seq = $self->{'_objref'}->get_seq_by_name($seqID); | |
335 my $sequence = $seq->seq; | |
336 | |
337 # Scanning the aligned regions of each read and registering confirmed sites | |
338 my $contig_ix = 0; | |
339 my $coord = $self->{'_objref'}->get_seq_feat_by_tag($seq,"_align_clipping:$seqID"); | |
340 my ($astart,$aend,$orientation) = ($coord->start,$coord->end,$coord->strand); | |
341 $astart = $self->{'_objref'}->change_coord('gapped consensus',"aligned $seqID",$astart); | |
342 $aend = $self->{'_objref'}->change_coord('gapped consensus',"aligned $seqID",$aend); | |
343 for ($i=$astart-1; $i<=$aend-1; $i++) { | |
344 # $i+1 in 'align' mode is $contig_ix | |
345 $contig_ix = $self->{'_objref'}->change_coord("aligned $seqID",'gapped consensus',$i+1); | |
346 next unless (($contig_ix >= $start) && ($contig_ix <= $end)); | |
347 my $r_base = uc(substr($sequence,$i,1)); | |
348 my $c_base = uc(substr($consensus,$contig_ix-1,1)); | |
349 if ($c_base eq '-') { | |
350 $confirmed{$orientation}[$contig_ix] = -1; | |
351 } elsif (uc($r_base) eq uc($c_base)) { # Non discrepant region found | |
352 $confirmed{$orientation}[$contig_ix]++; | |
353 } | |
354 } # for ($i=$astart-1; $i<=$aend-1; $i++) | |
355 } # foreach $seqID (@reads) | |
356 | |
357 # Locating non-confirmed aligned regions for each orientation in $confirmed registry | |
358 my ($orientation); | |
359 my @NCBS = (); | |
360 foreach $orientation (keys %confirmed) { | |
361 my ($ncbs_start,$ncbs_end); | |
362 | |
363 for ($i=$start; $i<=$end; $i++) { | |
364 if (!defined($ncbs_start) && | |
365 (!defined($confirmed{$orientation}[$i]) || ($confirmed{$orientation}[$i] == 0))) { | |
366 $ncbs_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i); | |
367 } elsif (defined($ncbs_start) && | |
368 defined($confirmed{$orientation}[$i]) && | |
369 ($confirmed{$orientation}[$i] > 0)) { | |
370 $ncbs_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i-1); | |
371 push(@NCBS, Bio::SeqFeature::Generic->new(-start=>$ncbs_start, | |
372 -end=>$ncbs_end, | |
373 -strand=>$orientation, | |
374 -primary=>"not_confirmed_on_both_strands") ); | |
375 $ncbs_start = undef; | |
376 } | |
377 } | |
378 | |
379 if (defined($ncbs_start)) { # NCBS at the end of contig | |
380 $ncbs_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$end); | |
381 push(@NCBS, Bio::SeqFeature::Generic->new(-start=>$ncbs_start, | |
382 -end=>$ncbs_end, | |
383 -strand=>$orientation, | |
384 -primary=>'not_confirmed_on_both_strands') ); | |
385 } | |
386 } | |
387 | |
388 return @NCBS; | |
389 } | |
390 | |
391 =head2 single_strand | |
392 | |
393 Title : single_strand | |
394 Usage : my $sfc = $ContigAnal->single_strand(); | |
395 Function : | |
396 | |
397 Locates all regions covered by aligned sequences only in | |
398 one of the two strands, i.e., regions for which aligned | |
399 sequence's strand() method returns +1 or -1 for all | |
400 sequences. | |
401 | |
402 Returns : an array of Bio::SeqFeature::Generic objects | |
403 Args : optional arguments are | |
404 -start : start of interval that will be analyzed | |
405 -end : start of interval that will be analyzed | |
406 -type : coordinate system type for interval | |
407 | |
408 =cut | |
409 | |
410 #' | |
411 sub single_strand { | |
412 my ($self,@args) = shift; # Package reference | |
413 | |
414 my ($start,$end,$type) = | |
415 $self->_rearrange([qw(START END TYPE)],@args); | |
416 | |
417 # Changing coordinates to gap mode align (consed: consensus sequence with alignments) | |
418 $type = 'gapped consensus' unless(defined($type)); | |
419 $start = 1 unless (defined($start)); | |
420 if (defined($type) && $type ne 'gapped consensus') { | |
421 $start = $self->{'objref'}->change_coord($type,'gapped consensus',$start); | |
422 $end = $self->{'objref'}->change_coord($type,'gapped consensus',$end) if (defined($end)); | |
423 } | |
424 ($end) = $self->{'_objref'}->get_consensus_length unless (defined($end)); | |
425 | |
426 # Loading complete list of coordinates for aligned sequences | |
427 my $sfc = $self->{'_objref'}->get_features_collection(); | |
428 my @forward = grep { $_->primary_tag =~ /^_aligned_coord:/ } | |
429 $sfc->features_in_range(-start=>$start, | |
430 -end=>$end, | |
431 -contain=>0, | |
432 -strand=>1, | |
433 -strandmatch=>'strong'); | |
434 my @reverse = grep { $_->primary_tag =~ /^_aligned_coord:/ } | |
435 $sfc->features_in_range(-start=>$start, | |
436 -end=>$end, | |
437 -contain=>0, | |
438 -strand=>-1, | |
439 -strandmatch=>'strong'); | |
440 # Merging overlapping features | |
441 @forward = $self->_merge_overlapping_features(@forward); | |
442 @reverse = $self->_merge_overlapping_features(@reverse); | |
443 | |
444 # Finding single stranded regions | |
445 my ($length) = $self->{'_objref'}->get_consensus_length; | |
446 $length = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$length); | |
447 @forward = $self->_complementary_features_list(1,$length,@forward); | |
448 @reverse = $self->_complementary_features_list(1,$length,@reverse); | |
449 | |
450 my @SS = (); | |
451 foreach my $feat (@forward, @reverse) { | |
452 $feat->primary_tag('single_strand_region'); | |
453 push(@SS,$feat); | |
454 } | |
455 | |
456 return @SS; | |
457 } | |
458 | |
459 =head1 Internal Methods | |
460 | |
461 =head2 _merge_overlapping_features | |
462 | |
463 Title : _merge_overlapping_features | |
464 Usage : my @feat = $ContigAnal->_merge_overlapping_features(@features); | |
465 Function : Merge all overlapping features into features | |
466 that hold original features as sub-features | |
467 Returns : array of Bio::SeqFeature::Generic objects | |
468 Args : array of Bio::SeqFeature::Generic objects | |
469 | |
470 =cut | |
471 | |
472 sub _merge_overlapping_features { | |
473 my ($self,@feat) = @_; | |
474 | |
475 $self->throw_not_implemented(); | |
476 } | |
477 | |
478 =head2 _complementary_features_list | |
479 | |
480 Title : _complementary_features_list | |
481 Usage : @feat = $ContigAnal->_complementary_features_list($start,$end,@features); | |
482 Function : Build a list of features for regions | |
483 not covered by features in @features array | |
484 Returns : array of Bio::SeqFeature::Generic objects | |
485 Args : | |
486 $start : [integer] start of first output feature | |
487 $end : [integer] end of last output feature | |
488 @features : array of Bio::SeqFeature::Generic objects | |
489 | |
490 =cut | |
491 | |
492 sub _complementary_features_list { | |
493 my ($self,$start,$end,@feat) = @_; | |
494 | |
495 $self->throw_not_implemented(); | |
496 } | |
497 | |
498 1; | |
499 | |
500 __END__ |