comparison variant_effect_predictor/Bio/EnsEMBL/DnaDnaAlignFeature.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21 =head1 NAME
22
23 Bio::EnsEMBL::DnaDnaAlignFeature - Ensembl specific dna-dna pairwise
24 alignment feature
25
26 =head1 SYNOPSIS
27
28 See BaseAlignFeature
29
30 =cut
31
32
33 package Bio::EnsEMBL::DnaDnaAlignFeature;
34
35 use strict;
36
37 use Bio::EnsEMBL::BaseAlignFeature;
38
39 use vars qw(@ISA);
40 use Bio::SimpleAlign;
41 use Bio::LocatableSeq;
42 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
43
44 @ISA = qw( Bio::EnsEMBL::BaseAlignFeature );
45
46
47 =head2 new
48
49 Arg [..] : List of named arguments. (-pair_dna_align_feature_id) defined
50 in this constructor, others defined in BaseFeaturePair and
51 SeqFeature superclasses.
52 Example : $daf = new DnaDnaAlignFeature(-cigar_string => '3M3I12M');
53 Description: Creates a new DnaDnaAlignFeature using either a cigarstring or
54 a list of ungapped features.
55 Returntype : Bio::EnsEMBL::DnaDnaAlignFeature
56 Exceptions : none
57 Caller : general
58 Status : Stable
59
60 =cut
61
62 sub new {
63
64 my $caller = shift;
65
66 my $class = ref($caller) || $caller;
67
68 my $self = $class->SUPER::new(@_);
69
70 my ($pair_dna_align_feature_id) = rearrange([qw(PAIR_DNA_ALIGN_FEATURE_ID)], @_);
71 if (defined $pair_dna_align_feature_id){
72 $self->{'pair_dna_align_feature_id'} = $pair_dna_align_feature_id;
73 }
74 return $self;
75 }
76
77
78 =head2 pair_dna_align_feature_id
79
80 Arg[1] : (optional) String $arg - value to set
81 Example : $self->pair_dna_align_feature_id($pair_feature_id);
82 Description: Getter/setter for attribute 'pair_dna_align_feature_id'
83 The id of the dna feature aligned
84 Returntype : String
85 Exceptions : none
86 Caller : general
87 Status : Stable
88
89 =cut
90
91 sub pair_dna_align_feature_id{
92 my ($self, $arg) = @_;
93 if (defined $arg){
94 $self->{pair_dna_align_feature_id} = $arg;
95 }
96 return $self->{pair_dna_align_feature_id};
97 }
98
99 =head2 _hit_unit
100
101 Arg [1] : none
102 Description: PRIVATE implementation of abstract superclass method. Returns
103 1 as the 'unit' used for the hit sequence.
104 Returntype : int
105 Exceptions : none
106 Caller : Bio::EnsEMBL::BaseAlignFeature
107 Status : Stable
108
109 =cut
110
111 sub _hit_unit {
112 return 1;
113 }
114
115
116
117 =head2 _query_unit
118
119 Arg [1] : none
120 Description: PRIVATE implementation of abstract superclass method Returns
121 1 as the 'unit' used for the hit sequence.
122 Returntype : int
123 Exceptions : none
124 Caller : Bio::EnsEMBL::BaseAlignFeature
125 Status : Stable
126
127 =cut
128
129 sub _query_unit {
130 return 1;
131 }
132
133 =head2 restrict_between_positions
134
135 Arg [1] : int $start
136 Arg [2] : int $end
137 Arg [3] : string $flags
138 SEQ = $start and $end apply to the seq sequence
139 i.e. start and end methods
140 HSEQ = $start and $end apply to the hseq sequence
141 i.e. hstart and hend methods
142 Example : $daf->restrict_between_positions(150,543,"SEQ")
143 Description: Build a new DnaDnaAlignFeature object that fits within
144 the new specified coordinates and sequence reference, cutting
145 any pieces hanging upstream and downstream.
146 Returntype : Bio::EnsEMBL::DnaDnaAlignFeature object
147 Exceptions :
148 Caller :
149 Status : Stable
150
151 =cut
152
153 sub restrict_between_positions {
154 my ($self,$start,$end,$seqref) = @_;
155
156 unless (defined $start && $start =~ /^\d+$/) {
157 $self->throw("The first argument is not defined or is not an integer");
158 }
159 unless (defined $end && $end =~ /^\d+$/) {
160 $self->throw("The second argument is not defined or is not an integer");
161 }
162 unless (defined $seqref &&
163 ($seqref eq "SEQ" || $seqref eq "HSEQ")) {
164 $self->throw("The third argument is not defined or is not equal to 'SEQ' or 'HSEQ'");
165 }
166
167 # symbolic method references should be forbidden!
168 # need to be rewrite at some stage.
169
170 my ($start_method1,$end_method1,$strand_method1,$start_method2,$end_method2,$strand_method2) =
171 qw(start end strand hstart hend hstrand);
172
173 if ($seqref eq "HSEQ") {
174 ($start_method1,$end_method1,$strand_method1,$start_method2,$end_method2,$strand_method2) =
175 qw(hstart hend hstrand start end strand);
176 }
177
178 my @restricted_features;
179
180 foreach my $ungapped_feature ($self->ungapped_features) {
181
182 if ($ungapped_feature->$start_method1() > $end ||
183 $ungapped_feature->$end_method1() < $start) {
184
185 next;
186
187 } elsif ($ungapped_feature->$end_method1() <= $end &&
188 $ungapped_feature->$start_method1() >= $start) {
189
190 push @restricted_features, $ungapped_feature;
191
192 } else {
193
194 if ($ungapped_feature->$strand_method1() eq $ungapped_feature->$strand_method2()) {
195
196 if ($ungapped_feature->$start_method1() < $start) {
197
198 my $offset = $start - $ungapped_feature->$start_method1();
199 $ungapped_feature->$start_method1($start);
200 $ungapped_feature->$start_method2($ungapped_feature->$start_method2() + $offset);
201
202 }
203 if ($ungapped_feature->$end_method1() > $end) {
204
205 my $offset = $ungapped_feature->$end_method1() - $end;
206 $ungapped_feature->$end_method1($end);
207 $ungapped_feature->$end_method2($ungapped_feature->$end_method2() - $offset);
208
209 }
210 } else {
211
212 if ($ungapped_feature->$start_method1() < $start) {
213
214 my $offset = $start - $ungapped_feature->$start_method1();
215 $ungapped_feature->$start_method1($start);
216 $ungapped_feature->$end_method2($ungapped_feature->$end_method2() - $offset);
217
218 }
219 if ($ungapped_feature->$end_method1() > $end) {
220
221 my $offset = $ungapped_feature->$end_method1() - $end;
222 $ungapped_feature->$end_method1($end);
223 $ungapped_feature->$start_method2($ungapped_feature->$start_method2() + $offset);
224
225 }
226 }
227
228 push @restricted_features, $ungapped_feature;
229 }
230 }
231
232 if (scalar @restricted_features) {
233 my $DnaDnaAlignFeature = new Bio::EnsEMBL::DnaDnaAlignFeature('-features' =>\@restricted_features);
234 if (defined $self->slice) {
235 $DnaDnaAlignFeature->slice($self->slice);
236 }
237 if (defined $self->hslice) {
238 $DnaDnaAlignFeature->hslice($self->hslice);
239 }
240 return $DnaDnaAlignFeature;
241 } else {
242 return undef;
243 }
244 }
245
246 =head2 alignment_strings
247
248 Arg [1] : list of string $flags
249 FIX_SEQ = does not introduce gaps (dashes) in seq aligned sequence
250 and delete the corresponding insertions in hseq aligned sequence
251 FIX_HSEQ = does not introduce gaps (dashes) in hseq aligned sequence
252 and delete the corresponding insertions in seq aligned sequence
253 NO_SEQ = return the seq aligned sequence as an empty string
254 NO_HSEQ = return the hseq aligned sequence as an empty string
255 This 2 last flags would save a bit of time as doing so no querying to the core
256 database in done to get the sequence.
257 Example : $daf->alignment_strings or
258 $daf->alignment_strings("FIX_HSEQ") or
259 $daf->alignment_strings("NO_SEQ","FIX_SEQ")
260 Description: Allows to rebuild the alignment string of both the seq and hseq sequence
261 using the cigar_string information and the slice and hslice objects
262 Returntype : array reference containing 2 strings
263 the first corresponds to seq
264 the second corresponds to hseq
265 Exceptions :
266 Caller :
267 Status : Stable
268
269 =cut
270
271
272 sub alignment_strings {
273 my ( $self, @flags ) = @_;
274
275 # set the flags
276 my $seq_flag = 1;
277 my $hseq_flag = 1;
278 my $fix_seq_flag = 0;
279 my $fix_hseq_flag = 0;
280
281 for my $flag ( @flags ) {
282 $seq_flag = 0 if ($flag eq "NO_SEQ");
283 $hseq_flag = 0 if ($flag eq "NO_HSEQ");
284 $fix_seq_flag = 1 if ($flag eq "FIX_SEQ");
285 $fix_hseq_flag = 1 if ($flag eq "FIX_HSEQ");
286 }
287
288 my ($seq, $hseq);
289 $seq = $self->slice->subseq($self->start, $self->end, $self->strand) if ($seq_flag || $fix_seq_flag);
290 $hseq = $self->hslice->subseq($self->hstart, $self->hend, $self->hstrand) if ($hseq_flag || $fix_hseq_flag);
291
292 my $rseq= "";
293 # rseq - result sequence
294 my $rhseq= "";
295 # rhseq - result hsequence
296
297 my $seq_pos = 0;
298 my $hseq_pos = 0;
299
300 my @cig = ( $self->cigar_string =~ /(\d*[DIM])/g );
301
302 for my $cigElem ( @cig ) {
303 my $cigType = substr( $cigElem, -1, 1 );
304 my $cigCount = substr( $cigElem, 0 ,-1 );
305 $cigCount = 1 unless $cigCount;
306
307 if( $cigType eq "M" ) {
308 $rseq .= substr( $seq, $seq_pos, $cigCount ) if ($seq_flag);
309 $rhseq .= substr( $hseq, $hseq_pos, $cigCount ) if ($hseq_flag);
310 $seq_pos += $cigCount;
311 $hseq_pos += $cigCount;
312 } elsif( $cigType eq "D" ) {
313 if( ! $fix_seq_flag ) {
314 $rseq .= "-" x $cigCount if ($seq_flag);
315 $rhseq .= substr( $hseq, $hseq_pos, $cigCount ) if ($hseq_flag);
316 }
317 $hseq_pos += $cigCount;
318 } elsif( $cigType eq "I" ) {
319 if( ! $fix_hseq_flag ) {
320 $rseq .= substr( $seq, $seq_pos, $cigCount ) if ($seq_flag);
321 $rhseq .= "-" x $cigCount if ($hseq_flag);
322 }
323 $seq_pos += $cigCount;
324 }
325 }
326 return [ $rseq,$rhseq ];
327 }
328
329 =head2 get_SimpleAlign
330
331 Arg [1] : list of string $flags
332 translated = by default, the sequence alignment will be on nucleotide. With translated flag
333 the aligned sequences are translated.
334 uc = by default aligned sequences are given in lower cases. With uc flag, the aligned
335 sequences are given in upper cases.
336 Example : $daf->get_SimpleAlign or
337 $daf->get_SimpleAlign("translated") or
338 $daf->get_SimpleAlign("translated","uc")
339 Description: Allows to rebuild the alignment string of both the seq and hseq sequence
340 using the cigar_string information and the slice and hslice objects
341 Returntype : a Bio::SimpleAlign object
342 Exceptions :
343 Caller :
344 Status : Stable
345
346 =cut
347
348 sub get_SimpleAlign {
349 my ( $self, @flags ) = @_;
350
351 # setting the flags
352 my $uc = 0;
353 my $translated = 0;
354
355 for my $flag ( @flags ) {
356 $uc = 1 if ($flag =~ /^uc$/i);
357 $translated = 1 if ($flag =~ /^translated$/i);
358 }
359
360 my $sa = Bio::SimpleAlign->new();
361
362 #Hack to try to work with both bioperl 0.7 and 1.2:
363 #Check to see if the method is called 'addSeq' or 'add_seq'
364 my $bio07 = 0;
365 if(!$sa->can('add_seq')) {
366 $bio07 = 1;
367 }
368
369 my ($sb_seq,$qy_seq) = @{$self->alignment_strings};
370
371 my $loc_sb_seq = Bio::LocatableSeq->new(-SEQ => $uc ? uc $sb_seq : lc $sb_seq,
372 -START => $self->seq_region_start,
373 -END => $self->seq_region_end,
374 -ID => $self->seqname,
375 -STRAND => $self->strand);
376
377 $loc_sb_seq->seq($uc ? uc $loc_sb_seq->translate->seq
378 : lc $loc_sb_seq->translate->seq) if ($translated);
379
380 my $loc_qy_seq = Bio::LocatableSeq->new(-SEQ => $uc ? uc $qy_seq : lc $qy_seq,
381 -START => $self->hseq_region_start,
382 -END => $self->hseq_region_end,
383 -ID => $self->hseqname,
384 -STRAND => $self->hstrand);
385
386 $loc_qy_seq->seq($uc ? uc $loc_qy_seq->translate->seq
387 : lc $loc_qy_seq->translate->seq) if ($translated);
388
389 if($bio07) {
390 $sa->addSeq($loc_sb_seq);
391 $sa->addSeq($loc_qy_seq);
392 } else {
393 $sa->add_seq($loc_sb_seq);
394 $sa->add_seq($loc_qy_seq);
395 }
396
397 return $sa;
398 }
399
400 1;