Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/DnaDnaAlignFeature.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <dev@ensembl.org>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =cut | |
20 | |
21 =head1 NAME | |
22 | |
23 Bio::EnsEMBL::DnaDnaAlignFeature - Ensembl specific dna-dna pairwise | |
24 alignment feature | |
25 | |
26 =head1 SYNOPSIS | |
27 | |
28 See BaseAlignFeature | |
29 | |
30 =cut | |
31 | |
32 | |
33 package Bio::EnsEMBL::DnaDnaAlignFeature; | |
34 | |
35 use strict; | |
36 | |
37 use Bio::EnsEMBL::BaseAlignFeature; | |
38 | |
39 use vars qw(@ISA); | |
40 use Bio::SimpleAlign; | |
41 use Bio::LocatableSeq; | |
42 use Bio::EnsEMBL::Utils::Argument qw(rearrange); | |
43 | |
44 @ISA = qw( Bio::EnsEMBL::BaseAlignFeature ); | |
45 | |
46 | |
47 =head2 new | |
48 | |
49 Arg [..] : List of named arguments. (-pair_dna_align_feature_id) defined | |
50 in this constructor, others defined in BaseFeaturePair and | |
51 SeqFeature superclasses. | |
52 Example : $daf = new DnaDnaAlignFeature(-cigar_string => '3M3I12M'); | |
53 Description: Creates a new DnaDnaAlignFeature using either a cigarstring or | |
54 a list of ungapped features. | |
55 Returntype : Bio::EnsEMBL::DnaDnaAlignFeature | |
56 Exceptions : none | |
57 Caller : general | |
58 Status : Stable | |
59 | |
60 =cut | |
61 | |
62 sub new { | |
63 | |
64 my $caller = shift; | |
65 | |
66 my $class = ref($caller) || $caller; | |
67 | |
68 my $self = $class->SUPER::new(@_); | |
69 | |
70 my ($pair_dna_align_feature_id) = rearrange([qw(PAIR_DNA_ALIGN_FEATURE_ID)], @_); | |
71 if (defined $pair_dna_align_feature_id){ | |
72 $self->{'pair_dna_align_feature_id'} = $pair_dna_align_feature_id; | |
73 } | |
74 return $self; | |
75 } | |
76 | |
77 | |
78 =head2 pair_dna_align_feature_id | |
79 | |
80 Arg[1] : (optional) String $arg - value to set | |
81 Example : $self->pair_dna_align_feature_id($pair_feature_id); | |
82 Description: Getter/setter for attribute 'pair_dna_align_feature_id' | |
83 The id of the dna feature aligned | |
84 Returntype : String | |
85 Exceptions : none | |
86 Caller : general | |
87 Status : Stable | |
88 | |
89 =cut | |
90 | |
91 sub pair_dna_align_feature_id{ | |
92 my ($self, $arg) = @_; | |
93 if (defined $arg){ | |
94 $self->{pair_dna_align_feature_id} = $arg; | |
95 } | |
96 return $self->{pair_dna_align_feature_id}; | |
97 } | |
98 | |
99 =head2 _hit_unit | |
100 | |
101 Arg [1] : none | |
102 Description: PRIVATE implementation of abstract superclass method. Returns | |
103 1 as the 'unit' used for the hit sequence. | |
104 Returntype : int | |
105 Exceptions : none | |
106 Caller : Bio::EnsEMBL::BaseAlignFeature | |
107 Status : Stable | |
108 | |
109 =cut | |
110 | |
111 sub _hit_unit { | |
112 return 1; | |
113 } | |
114 | |
115 | |
116 | |
117 =head2 _query_unit | |
118 | |
119 Arg [1] : none | |
120 Description: PRIVATE implementation of abstract superclass method Returns | |
121 1 as the 'unit' used for the hit sequence. | |
122 Returntype : int | |
123 Exceptions : none | |
124 Caller : Bio::EnsEMBL::BaseAlignFeature | |
125 Status : Stable | |
126 | |
127 =cut | |
128 | |
129 sub _query_unit { | |
130 return 1; | |
131 } | |
132 | |
133 =head2 restrict_between_positions | |
134 | |
135 Arg [1] : int $start | |
136 Arg [2] : int $end | |
137 Arg [3] : string $flags | |
138 SEQ = $start and $end apply to the seq sequence | |
139 i.e. start and end methods | |
140 HSEQ = $start and $end apply to the hseq sequence | |
141 i.e. hstart and hend methods | |
142 Example : $daf->restrict_between_positions(150,543,"SEQ") | |
143 Description: Build a new DnaDnaAlignFeature object that fits within | |
144 the new specified coordinates and sequence reference, cutting | |
145 any pieces hanging upstream and downstream. | |
146 Returntype : Bio::EnsEMBL::DnaDnaAlignFeature object | |
147 Exceptions : | |
148 Caller : | |
149 Status : Stable | |
150 | |
151 =cut | |
152 | |
153 sub restrict_between_positions { | |
154 my ($self,$start,$end,$seqref) = @_; | |
155 | |
156 unless (defined $start && $start =~ /^\d+$/) { | |
157 $self->throw("The first argument is not defined or is not an integer"); | |
158 } | |
159 unless (defined $end && $end =~ /^\d+$/) { | |
160 $self->throw("The second argument is not defined or is not an integer"); | |
161 } | |
162 unless (defined $seqref && | |
163 ($seqref eq "SEQ" || $seqref eq "HSEQ")) { | |
164 $self->throw("The third argument is not defined or is not equal to 'SEQ' or 'HSEQ'"); | |
165 } | |
166 | |
167 # symbolic method references should be forbidden! | |
168 # need to be rewrite at some stage. | |
169 | |
170 my ($start_method1,$end_method1,$strand_method1,$start_method2,$end_method2,$strand_method2) = | |
171 qw(start end strand hstart hend hstrand); | |
172 | |
173 if ($seqref eq "HSEQ") { | |
174 ($start_method1,$end_method1,$strand_method1,$start_method2,$end_method2,$strand_method2) = | |
175 qw(hstart hend hstrand start end strand); | |
176 } | |
177 | |
178 my @restricted_features; | |
179 | |
180 foreach my $ungapped_feature ($self->ungapped_features) { | |
181 | |
182 if ($ungapped_feature->$start_method1() > $end || | |
183 $ungapped_feature->$end_method1() < $start) { | |
184 | |
185 next; | |
186 | |
187 } elsif ($ungapped_feature->$end_method1() <= $end && | |
188 $ungapped_feature->$start_method1() >= $start) { | |
189 | |
190 push @restricted_features, $ungapped_feature; | |
191 | |
192 } else { | |
193 | |
194 if ($ungapped_feature->$strand_method1() eq $ungapped_feature->$strand_method2()) { | |
195 | |
196 if ($ungapped_feature->$start_method1() < $start) { | |
197 | |
198 my $offset = $start - $ungapped_feature->$start_method1(); | |
199 $ungapped_feature->$start_method1($start); | |
200 $ungapped_feature->$start_method2($ungapped_feature->$start_method2() + $offset); | |
201 | |
202 } | |
203 if ($ungapped_feature->$end_method1() > $end) { | |
204 | |
205 my $offset = $ungapped_feature->$end_method1() - $end; | |
206 $ungapped_feature->$end_method1($end); | |
207 $ungapped_feature->$end_method2($ungapped_feature->$end_method2() - $offset); | |
208 | |
209 } | |
210 } else { | |
211 | |
212 if ($ungapped_feature->$start_method1() < $start) { | |
213 | |
214 my $offset = $start - $ungapped_feature->$start_method1(); | |
215 $ungapped_feature->$start_method1($start); | |
216 $ungapped_feature->$end_method2($ungapped_feature->$end_method2() - $offset); | |
217 | |
218 } | |
219 if ($ungapped_feature->$end_method1() > $end) { | |
220 | |
221 my $offset = $ungapped_feature->$end_method1() - $end; | |
222 $ungapped_feature->$end_method1($end); | |
223 $ungapped_feature->$start_method2($ungapped_feature->$start_method2() + $offset); | |
224 | |
225 } | |
226 } | |
227 | |
228 push @restricted_features, $ungapped_feature; | |
229 } | |
230 } | |
231 | |
232 if (scalar @restricted_features) { | |
233 my $DnaDnaAlignFeature = new Bio::EnsEMBL::DnaDnaAlignFeature('-features' =>\@restricted_features); | |
234 if (defined $self->slice) { | |
235 $DnaDnaAlignFeature->slice($self->slice); | |
236 } | |
237 if (defined $self->hslice) { | |
238 $DnaDnaAlignFeature->hslice($self->hslice); | |
239 } | |
240 return $DnaDnaAlignFeature; | |
241 } else { | |
242 return undef; | |
243 } | |
244 } | |
245 | |
246 =head2 alignment_strings | |
247 | |
248 Arg [1] : list of string $flags | |
249 FIX_SEQ = does not introduce gaps (dashes) in seq aligned sequence | |
250 and delete the corresponding insertions in hseq aligned sequence | |
251 FIX_HSEQ = does not introduce gaps (dashes) in hseq aligned sequence | |
252 and delete the corresponding insertions in seq aligned sequence | |
253 NO_SEQ = return the seq aligned sequence as an empty string | |
254 NO_HSEQ = return the hseq aligned sequence as an empty string | |
255 This 2 last flags would save a bit of time as doing so no querying to the core | |
256 database in done to get the sequence. | |
257 Example : $daf->alignment_strings or | |
258 $daf->alignment_strings("FIX_HSEQ") or | |
259 $daf->alignment_strings("NO_SEQ","FIX_SEQ") | |
260 Description: Allows to rebuild the alignment string of both the seq and hseq sequence | |
261 using the cigar_string information and the slice and hslice objects | |
262 Returntype : array reference containing 2 strings | |
263 the first corresponds to seq | |
264 the second corresponds to hseq | |
265 Exceptions : | |
266 Caller : | |
267 Status : Stable | |
268 | |
269 =cut | |
270 | |
271 | |
272 sub alignment_strings { | |
273 my ( $self, @flags ) = @_; | |
274 | |
275 # set the flags | |
276 my $seq_flag = 1; | |
277 my $hseq_flag = 1; | |
278 my $fix_seq_flag = 0; | |
279 my $fix_hseq_flag = 0; | |
280 | |
281 for my $flag ( @flags ) { | |
282 $seq_flag = 0 if ($flag eq "NO_SEQ"); | |
283 $hseq_flag = 0 if ($flag eq "NO_HSEQ"); | |
284 $fix_seq_flag = 1 if ($flag eq "FIX_SEQ"); | |
285 $fix_hseq_flag = 1 if ($flag eq "FIX_HSEQ"); | |
286 } | |
287 | |
288 my ($seq, $hseq); | |
289 $seq = $self->slice->subseq($self->start, $self->end, $self->strand) if ($seq_flag || $fix_seq_flag); | |
290 $hseq = $self->hslice->subseq($self->hstart, $self->hend, $self->hstrand) if ($hseq_flag || $fix_hseq_flag); | |
291 | |
292 my $rseq= ""; | |
293 # rseq - result sequence | |
294 my $rhseq= ""; | |
295 # rhseq - result hsequence | |
296 | |
297 my $seq_pos = 0; | |
298 my $hseq_pos = 0; | |
299 | |
300 my @cig = ( $self->cigar_string =~ /(\d*[DIM])/g ); | |
301 | |
302 for my $cigElem ( @cig ) { | |
303 my $cigType = substr( $cigElem, -1, 1 ); | |
304 my $cigCount = substr( $cigElem, 0 ,-1 ); | |
305 $cigCount = 1 unless $cigCount; | |
306 | |
307 if( $cigType eq "M" ) { | |
308 $rseq .= substr( $seq, $seq_pos, $cigCount ) if ($seq_flag); | |
309 $rhseq .= substr( $hseq, $hseq_pos, $cigCount ) if ($hseq_flag); | |
310 $seq_pos += $cigCount; | |
311 $hseq_pos += $cigCount; | |
312 } elsif( $cigType eq "D" ) { | |
313 if( ! $fix_seq_flag ) { | |
314 $rseq .= "-" x $cigCount if ($seq_flag); | |
315 $rhseq .= substr( $hseq, $hseq_pos, $cigCount ) if ($hseq_flag); | |
316 } | |
317 $hseq_pos += $cigCount; | |
318 } elsif( $cigType eq "I" ) { | |
319 if( ! $fix_hseq_flag ) { | |
320 $rseq .= substr( $seq, $seq_pos, $cigCount ) if ($seq_flag); | |
321 $rhseq .= "-" x $cigCount if ($hseq_flag); | |
322 } | |
323 $seq_pos += $cigCount; | |
324 } | |
325 } | |
326 return [ $rseq,$rhseq ]; | |
327 } | |
328 | |
329 =head2 get_SimpleAlign | |
330 | |
331 Arg [1] : list of string $flags | |
332 translated = by default, the sequence alignment will be on nucleotide. With translated flag | |
333 the aligned sequences are translated. | |
334 uc = by default aligned sequences are given in lower cases. With uc flag, the aligned | |
335 sequences are given in upper cases. | |
336 Example : $daf->get_SimpleAlign or | |
337 $daf->get_SimpleAlign("translated") or | |
338 $daf->get_SimpleAlign("translated","uc") | |
339 Description: Allows to rebuild the alignment string of both the seq and hseq sequence | |
340 using the cigar_string information and the slice and hslice objects | |
341 Returntype : a Bio::SimpleAlign object | |
342 Exceptions : | |
343 Caller : | |
344 Status : Stable | |
345 | |
346 =cut | |
347 | |
348 sub get_SimpleAlign { | |
349 my ( $self, @flags ) = @_; | |
350 | |
351 # setting the flags | |
352 my $uc = 0; | |
353 my $translated = 0; | |
354 | |
355 for my $flag ( @flags ) { | |
356 $uc = 1 if ($flag =~ /^uc$/i); | |
357 $translated = 1 if ($flag =~ /^translated$/i); | |
358 } | |
359 | |
360 my $sa = Bio::SimpleAlign->new(); | |
361 | |
362 #Hack to try to work with both bioperl 0.7 and 1.2: | |
363 #Check to see if the method is called 'addSeq' or 'add_seq' | |
364 my $bio07 = 0; | |
365 if(!$sa->can('add_seq')) { | |
366 $bio07 = 1; | |
367 } | |
368 | |
369 my ($sb_seq,$qy_seq) = @{$self->alignment_strings}; | |
370 | |
371 my $loc_sb_seq = Bio::LocatableSeq->new(-SEQ => $uc ? uc $sb_seq : lc $sb_seq, | |
372 -START => $self->seq_region_start, | |
373 -END => $self->seq_region_end, | |
374 -ID => $self->seqname, | |
375 -STRAND => $self->strand); | |
376 | |
377 $loc_sb_seq->seq($uc ? uc $loc_sb_seq->translate->seq | |
378 : lc $loc_sb_seq->translate->seq) if ($translated); | |
379 | |
380 my $loc_qy_seq = Bio::LocatableSeq->new(-SEQ => $uc ? uc $qy_seq : lc $qy_seq, | |
381 -START => $self->hseq_region_start, | |
382 -END => $self->hseq_region_end, | |
383 -ID => $self->hseqname, | |
384 -STRAND => $self->hstrand); | |
385 | |
386 $loc_qy_seq->seq($uc ? uc $loc_qy_seq->translate->seq | |
387 : lc $loc_qy_seq->translate->seq) if ($translated); | |
388 | |
389 if($bio07) { | |
390 $sa->addSeq($loc_sb_seq); | |
391 $sa->addSeq($loc_qy_seq); | |
392 } else { | |
393 $sa->add_seq($loc_sb_seq); | |
394 $sa->add_seq($loc_qy_seq); | |
395 } | |
396 | |
397 return $sa; | |
398 } | |
399 | |
400 1; |