0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =cut
|
|
20
|
|
21 =head1 NAME
|
|
22
|
|
23 Bio::EnsEMBL::DnaDnaAlignFeature - Ensembl specific dna-dna pairwise
|
|
24 alignment feature
|
|
25
|
|
26 =head1 SYNOPSIS
|
|
27
|
|
28 See BaseAlignFeature
|
|
29
|
|
30 =cut
|
|
31
|
|
32
|
|
33 package Bio::EnsEMBL::DnaDnaAlignFeature;
|
|
34
|
|
35 use strict;
|
|
36
|
|
37 use Bio::EnsEMBL::BaseAlignFeature;
|
|
38
|
|
39 use vars qw(@ISA);
|
|
40 use Bio::SimpleAlign;
|
|
41 use Bio::LocatableSeq;
|
|
42 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
|
|
43
|
|
44 @ISA = qw( Bio::EnsEMBL::BaseAlignFeature );
|
|
45
|
|
46
|
|
47 =head2 new
|
|
48
|
|
49 Arg [..] : List of named arguments. (-pair_dna_align_feature_id) defined
|
|
50 in this constructor, others defined in BaseFeaturePair and
|
|
51 SeqFeature superclasses.
|
|
52 Example : $daf = new DnaDnaAlignFeature(-cigar_string => '3M3I12M');
|
|
53 Description: Creates a new DnaDnaAlignFeature using either a cigarstring or
|
|
54 a list of ungapped features.
|
|
55 Returntype : Bio::EnsEMBL::DnaDnaAlignFeature
|
|
56 Exceptions : none
|
|
57 Caller : general
|
|
58 Status : Stable
|
|
59
|
|
60 =cut
|
|
61
|
|
62 sub new {
|
|
63
|
|
64 my $caller = shift;
|
|
65
|
|
66 my $class = ref($caller) || $caller;
|
|
67
|
|
68 my $self = $class->SUPER::new(@_);
|
|
69
|
|
70 my ($pair_dna_align_feature_id) = rearrange([qw(PAIR_DNA_ALIGN_FEATURE_ID)], @_);
|
|
71 if (defined $pair_dna_align_feature_id){
|
|
72 $self->{'pair_dna_align_feature_id'} = $pair_dna_align_feature_id;
|
|
73 }
|
|
74 return $self;
|
|
75 }
|
|
76
|
|
77
|
|
78 =head2 pair_dna_align_feature_id
|
|
79
|
|
80 Arg[1] : (optional) String $arg - value to set
|
|
81 Example : $self->pair_dna_align_feature_id($pair_feature_id);
|
|
82 Description: Getter/setter for attribute 'pair_dna_align_feature_id'
|
|
83 The id of the dna feature aligned
|
|
84 Returntype : String
|
|
85 Exceptions : none
|
|
86 Caller : general
|
|
87 Status : Stable
|
|
88
|
|
89 =cut
|
|
90
|
|
91 sub pair_dna_align_feature_id{
|
|
92 my ($self, $arg) = @_;
|
|
93 if (defined $arg){
|
|
94 $self->{pair_dna_align_feature_id} = $arg;
|
|
95 }
|
|
96 return $self->{pair_dna_align_feature_id};
|
|
97 }
|
|
98
|
|
99 =head2 _hit_unit
|
|
100
|
|
101 Arg [1] : none
|
|
102 Description: PRIVATE implementation of abstract superclass method. Returns
|
|
103 1 as the 'unit' used for the hit sequence.
|
|
104 Returntype : int
|
|
105 Exceptions : none
|
|
106 Caller : Bio::EnsEMBL::BaseAlignFeature
|
|
107 Status : Stable
|
|
108
|
|
109 =cut
|
|
110
|
|
111 sub _hit_unit {
|
|
112 return 1;
|
|
113 }
|
|
114
|
|
115
|
|
116
|
|
117 =head2 _query_unit
|
|
118
|
|
119 Arg [1] : none
|
|
120 Description: PRIVATE implementation of abstract superclass method Returns
|
|
121 1 as the 'unit' used for the hit sequence.
|
|
122 Returntype : int
|
|
123 Exceptions : none
|
|
124 Caller : Bio::EnsEMBL::BaseAlignFeature
|
|
125 Status : Stable
|
|
126
|
|
127 =cut
|
|
128
|
|
129 sub _query_unit {
|
|
130 return 1;
|
|
131 }
|
|
132
|
|
133 =head2 restrict_between_positions
|
|
134
|
|
135 Arg [1] : int $start
|
|
136 Arg [2] : int $end
|
|
137 Arg [3] : string $flags
|
|
138 SEQ = $start and $end apply to the seq sequence
|
|
139 i.e. start and end methods
|
|
140 HSEQ = $start and $end apply to the hseq sequence
|
|
141 i.e. hstart and hend methods
|
|
142 Example : $daf->restrict_between_positions(150,543,"SEQ")
|
|
143 Description: Build a new DnaDnaAlignFeature object that fits within
|
|
144 the new specified coordinates and sequence reference, cutting
|
|
145 any pieces hanging upstream and downstream.
|
|
146 Returntype : Bio::EnsEMBL::DnaDnaAlignFeature object
|
|
147 Exceptions :
|
|
148 Caller :
|
|
149 Status : Stable
|
|
150
|
|
151 =cut
|
|
152
|
|
153 sub restrict_between_positions {
|
|
154 my ($self,$start,$end,$seqref) = @_;
|
|
155
|
|
156 unless (defined $start && $start =~ /^\d+$/) {
|
|
157 $self->throw("The first argument is not defined or is not an integer");
|
|
158 }
|
|
159 unless (defined $end && $end =~ /^\d+$/) {
|
|
160 $self->throw("The second argument is not defined or is not an integer");
|
|
161 }
|
|
162 unless (defined $seqref &&
|
|
163 ($seqref eq "SEQ" || $seqref eq "HSEQ")) {
|
|
164 $self->throw("The third argument is not defined or is not equal to 'SEQ' or 'HSEQ'");
|
|
165 }
|
|
166
|
|
167 # symbolic method references should be forbidden!
|
|
168 # need to be rewrite at some stage.
|
|
169
|
|
170 my ($start_method1,$end_method1,$strand_method1,$start_method2,$end_method2,$strand_method2) =
|
|
171 qw(start end strand hstart hend hstrand);
|
|
172
|
|
173 if ($seqref eq "HSEQ") {
|
|
174 ($start_method1,$end_method1,$strand_method1,$start_method2,$end_method2,$strand_method2) =
|
|
175 qw(hstart hend hstrand start end strand);
|
|
176 }
|
|
177
|
|
178 my @restricted_features;
|
|
179
|
|
180 foreach my $ungapped_feature ($self->ungapped_features) {
|
|
181
|
|
182 if ($ungapped_feature->$start_method1() > $end ||
|
|
183 $ungapped_feature->$end_method1() < $start) {
|
|
184
|
|
185 next;
|
|
186
|
|
187 } elsif ($ungapped_feature->$end_method1() <= $end &&
|
|
188 $ungapped_feature->$start_method1() >= $start) {
|
|
189
|
|
190 push @restricted_features, $ungapped_feature;
|
|
191
|
|
192 } else {
|
|
193
|
|
194 if ($ungapped_feature->$strand_method1() eq $ungapped_feature->$strand_method2()) {
|
|
195
|
|
196 if ($ungapped_feature->$start_method1() < $start) {
|
|
197
|
|
198 my $offset = $start - $ungapped_feature->$start_method1();
|
|
199 $ungapped_feature->$start_method1($start);
|
|
200 $ungapped_feature->$start_method2($ungapped_feature->$start_method2() + $offset);
|
|
201
|
|
202 }
|
|
203 if ($ungapped_feature->$end_method1() > $end) {
|
|
204
|
|
205 my $offset = $ungapped_feature->$end_method1() - $end;
|
|
206 $ungapped_feature->$end_method1($end);
|
|
207 $ungapped_feature->$end_method2($ungapped_feature->$end_method2() - $offset);
|
|
208
|
|
209 }
|
|
210 } else {
|
|
211
|
|
212 if ($ungapped_feature->$start_method1() < $start) {
|
|
213
|
|
214 my $offset = $start - $ungapped_feature->$start_method1();
|
|
215 $ungapped_feature->$start_method1($start);
|
|
216 $ungapped_feature->$end_method2($ungapped_feature->$end_method2() - $offset);
|
|
217
|
|
218 }
|
|
219 if ($ungapped_feature->$end_method1() > $end) {
|
|
220
|
|
221 my $offset = $ungapped_feature->$end_method1() - $end;
|
|
222 $ungapped_feature->$end_method1($end);
|
|
223 $ungapped_feature->$start_method2($ungapped_feature->$start_method2() + $offset);
|
|
224
|
|
225 }
|
|
226 }
|
|
227
|
|
228 push @restricted_features, $ungapped_feature;
|
|
229 }
|
|
230 }
|
|
231
|
|
232 if (scalar @restricted_features) {
|
|
233 my $DnaDnaAlignFeature = new Bio::EnsEMBL::DnaDnaAlignFeature('-features' =>\@restricted_features);
|
|
234 if (defined $self->slice) {
|
|
235 $DnaDnaAlignFeature->slice($self->slice);
|
|
236 }
|
|
237 if (defined $self->hslice) {
|
|
238 $DnaDnaAlignFeature->hslice($self->hslice);
|
|
239 }
|
|
240 return $DnaDnaAlignFeature;
|
|
241 } else {
|
|
242 return undef;
|
|
243 }
|
|
244 }
|
|
245
|
|
246 =head2 alignment_strings
|
|
247
|
|
248 Arg [1] : list of string $flags
|
|
249 FIX_SEQ = does not introduce gaps (dashes) in seq aligned sequence
|
|
250 and delete the corresponding insertions in hseq aligned sequence
|
|
251 FIX_HSEQ = does not introduce gaps (dashes) in hseq aligned sequence
|
|
252 and delete the corresponding insertions in seq aligned sequence
|
|
253 NO_SEQ = return the seq aligned sequence as an empty string
|
|
254 NO_HSEQ = return the hseq aligned sequence as an empty string
|
|
255 This 2 last flags would save a bit of time as doing so no querying to the core
|
|
256 database in done to get the sequence.
|
|
257 Example : $daf->alignment_strings or
|
|
258 $daf->alignment_strings("FIX_HSEQ") or
|
|
259 $daf->alignment_strings("NO_SEQ","FIX_SEQ")
|
|
260 Description: Allows to rebuild the alignment string of both the seq and hseq sequence
|
|
261 using the cigar_string information and the slice and hslice objects
|
|
262 Returntype : array reference containing 2 strings
|
|
263 the first corresponds to seq
|
|
264 the second corresponds to hseq
|
|
265 Exceptions :
|
|
266 Caller :
|
|
267 Status : Stable
|
|
268
|
|
269 =cut
|
|
270
|
|
271
|
|
272 sub alignment_strings {
|
|
273 my ( $self, @flags ) = @_;
|
|
274
|
|
275 # set the flags
|
|
276 my $seq_flag = 1;
|
|
277 my $hseq_flag = 1;
|
|
278 my $fix_seq_flag = 0;
|
|
279 my $fix_hseq_flag = 0;
|
|
280
|
|
281 for my $flag ( @flags ) {
|
|
282 $seq_flag = 0 if ($flag eq "NO_SEQ");
|
|
283 $hseq_flag = 0 if ($flag eq "NO_HSEQ");
|
|
284 $fix_seq_flag = 1 if ($flag eq "FIX_SEQ");
|
|
285 $fix_hseq_flag = 1 if ($flag eq "FIX_HSEQ");
|
|
286 }
|
|
287
|
|
288 my ($seq, $hseq);
|
|
289 $seq = $self->slice->subseq($self->start, $self->end, $self->strand) if ($seq_flag || $fix_seq_flag);
|
|
290 $hseq = $self->hslice->subseq($self->hstart, $self->hend, $self->hstrand) if ($hseq_flag || $fix_hseq_flag);
|
|
291
|
|
292 my $rseq= "";
|
|
293 # rseq - result sequence
|
|
294 my $rhseq= "";
|
|
295 # rhseq - result hsequence
|
|
296
|
|
297 my $seq_pos = 0;
|
|
298 my $hseq_pos = 0;
|
|
299
|
|
300 my @cig = ( $self->cigar_string =~ /(\d*[DIM])/g );
|
|
301
|
|
302 for my $cigElem ( @cig ) {
|
|
303 my $cigType = substr( $cigElem, -1, 1 );
|
|
304 my $cigCount = substr( $cigElem, 0 ,-1 );
|
|
305 $cigCount = 1 unless $cigCount;
|
|
306
|
|
307 if( $cigType eq "M" ) {
|
|
308 $rseq .= substr( $seq, $seq_pos, $cigCount ) if ($seq_flag);
|
|
309 $rhseq .= substr( $hseq, $hseq_pos, $cigCount ) if ($hseq_flag);
|
|
310 $seq_pos += $cigCount;
|
|
311 $hseq_pos += $cigCount;
|
|
312 } elsif( $cigType eq "D" ) {
|
|
313 if( ! $fix_seq_flag ) {
|
|
314 $rseq .= "-" x $cigCount if ($seq_flag);
|
|
315 $rhseq .= substr( $hseq, $hseq_pos, $cigCount ) if ($hseq_flag);
|
|
316 }
|
|
317 $hseq_pos += $cigCount;
|
|
318 } elsif( $cigType eq "I" ) {
|
|
319 if( ! $fix_hseq_flag ) {
|
|
320 $rseq .= substr( $seq, $seq_pos, $cigCount ) if ($seq_flag);
|
|
321 $rhseq .= "-" x $cigCount if ($hseq_flag);
|
|
322 }
|
|
323 $seq_pos += $cigCount;
|
|
324 }
|
|
325 }
|
|
326 return [ $rseq,$rhseq ];
|
|
327 }
|
|
328
|
|
329 =head2 get_SimpleAlign
|
|
330
|
|
331 Arg [1] : list of string $flags
|
|
332 translated = by default, the sequence alignment will be on nucleotide. With translated flag
|
|
333 the aligned sequences are translated.
|
|
334 uc = by default aligned sequences are given in lower cases. With uc flag, the aligned
|
|
335 sequences are given in upper cases.
|
|
336 Example : $daf->get_SimpleAlign or
|
|
337 $daf->get_SimpleAlign("translated") or
|
|
338 $daf->get_SimpleAlign("translated","uc")
|
|
339 Description: Allows to rebuild the alignment string of both the seq and hseq sequence
|
|
340 using the cigar_string information and the slice and hslice objects
|
|
341 Returntype : a Bio::SimpleAlign object
|
|
342 Exceptions :
|
|
343 Caller :
|
|
344 Status : Stable
|
|
345
|
|
346 =cut
|
|
347
|
|
348 sub get_SimpleAlign {
|
|
349 my ( $self, @flags ) = @_;
|
|
350
|
|
351 # setting the flags
|
|
352 my $uc = 0;
|
|
353 my $translated = 0;
|
|
354
|
|
355 for my $flag ( @flags ) {
|
|
356 $uc = 1 if ($flag =~ /^uc$/i);
|
|
357 $translated = 1 if ($flag =~ /^translated$/i);
|
|
358 }
|
|
359
|
|
360 my $sa = Bio::SimpleAlign->new();
|
|
361
|
|
362 #Hack to try to work with both bioperl 0.7 and 1.2:
|
|
363 #Check to see if the method is called 'addSeq' or 'add_seq'
|
|
364 my $bio07 = 0;
|
|
365 if(!$sa->can('add_seq')) {
|
|
366 $bio07 = 1;
|
|
367 }
|
|
368
|
|
369 my ($sb_seq,$qy_seq) = @{$self->alignment_strings};
|
|
370
|
|
371 my $loc_sb_seq = Bio::LocatableSeq->new(-SEQ => $uc ? uc $sb_seq : lc $sb_seq,
|
|
372 -START => $self->seq_region_start,
|
|
373 -END => $self->seq_region_end,
|
|
374 -ID => $self->seqname,
|
|
375 -STRAND => $self->strand);
|
|
376
|
|
377 $loc_sb_seq->seq($uc ? uc $loc_sb_seq->translate->seq
|
|
378 : lc $loc_sb_seq->translate->seq) if ($translated);
|
|
379
|
|
380 my $loc_qy_seq = Bio::LocatableSeq->new(-SEQ => $uc ? uc $qy_seq : lc $qy_seq,
|
|
381 -START => $self->hseq_region_start,
|
|
382 -END => $self->hseq_region_end,
|
|
383 -ID => $self->hseqname,
|
|
384 -STRAND => $self->hstrand);
|
|
385
|
|
386 $loc_qy_seq->seq($uc ? uc $loc_qy_seq->translate->seq
|
|
387 : lc $loc_qy_seq->translate->seq) if ($translated);
|
|
388
|
|
389 if($bio07) {
|
|
390 $sa->addSeq($loc_sb_seq);
|
|
391 $sa->addSeq($loc_qy_seq);
|
|
392 } else {
|
|
393 $sa->add_seq($loc_sb_seq);
|
|
394 $sa->add_seq($loc_qy_seq);
|
|
395 }
|
|
396
|
|
397 return $sa;
|
|
398 }
|
|
399
|
|
400 1;
|