Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/DnaDnaAlignFeature.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 =head1 LICENSE | |
| 2 | |
| 3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
| 4 Genome Research Limited. All rights reserved. | |
| 5 | |
| 6 This software is distributed under a modified Apache license. | |
| 7 For license details, please see | |
| 8 | |
| 9 http://www.ensembl.org/info/about/code_licence.html | |
| 10 | |
| 11 =head1 CONTACT | |
| 12 | |
| 13 Please email comments or questions to the public Ensembl | |
| 14 developers list at <dev@ensembl.org>. | |
| 15 | |
| 16 Questions may also be sent to the Ensembl help desk at | |
| 17 <helpdesk@ensembl.org>. | |
| 18 | |
| 19 =cut | |
| 20 | |
| 21 =head1 NAME | |
| 22 | |
| 23 Bio::EnsEMBL::DnaDnaAlignFeature - Ensembl specific dna-dna pairwise | |
| 24 alignment feature | |
| 25 | |
| 26 =head1 SYNOPSIS | |
| 27 | |
| 28 See BaseAlignFeature | |
| 29 | |
| 30 =cut | |
| 31 | |
| 32 | |
| 33 package Bio::EnsEMBL::DnaDnaAlignFeature; | |
| 34 | |
| 35 use strict; | |
| 36 | |
| 37 use Bio::EnsEMBL::BaseAlignFeature; | |
| 38 | |
| 39 use vars qw(@ISA); | |
| 40 use Bio::SimpleAlign; | |
| 41 use Bio::LocatableSeq; | |
| 42 use Bio::EnsEMBL::Utils::Argument qw(rearrange); | |
| 43 | |
| 44 @ISA = qw( Bio::EnsEMBL::BaseAlignFeature ); | |
| 45 | |
| 46 | |
| 47 =head2 new | |
| 48 | |
| 49 Arg [..] : List of named arguments. (-pair_dna_align_feature_id) defined | |
| 50 in this constructor, others defined in BaseFeaturePair and | |
| 51 SeqFeature superclasses. | |
| 52 Example : $daf = new DnaDnaAlignFeature(-cigar_string => '3M3I12M'); | |
| 53 Description: Creates a new DnaDnaAlignFeature using either a cigarstring or | |
| 54 a list of ungapped features. | |
| 55 Returntype : Bio::EnsEMBL::DnaDnaAlignFeature | |
| 56 Exceptions : none | |
| 57 Caller : general | |
| 58 Status : Stable | |
| 59 | |
| 60 =cut | |
| 61 | |
| 62 sub new { | |
| 63 | |
| 64 my $caller = shift; | |
| 65 | |
| 66 my $class = ref($caller) || $caller; | |
| 67 | |
| 68 my $self = $class->SUPER::new(@_); | |
| 69 | |
| 70 my ($pair_dna_align_feature_id) = rearrange([qw(PAIR_DNA_ALIGN_FEATURE_ID)], @_); | |
| 71 if (defined $pair_dna_align_feature_id){ | |
| 72 $self->{'pair_dna_align_feature_id'} = $pair_dna_align_feature_id; | |
| 73 } | |
| 74 return $self; | |
| 75 } | |
| 76 | |
| 77 | |
| 78 =head2 pair_dna_align_feature_id | |
| 79 | |
| 80 Arg[1] : (optional) String $arg - value to set | |
| 81 Example : $self->pair_dna_align_feature_id($pair_feature_id); | |
| 82 Description: Getter/setter for attribute 'pair_dna_align_feature_id' | |
| 83 The id of the dna feature aligned | |
| 84 Returntype : String | |
| 85 Exceptions : none | |
| 86 Caller : general | |
| 87 Status : Stable | |
| 88 | |
| 89 =cut | |
| 90 | |
| 91 sub pair_dna_align_feature_id{ | |
| 92 my ($self, $arg) = @_; | |
| 93 if (defined $arg){ | |
| 94 $self->{pair_dna_align_feature_id} = $arg; | |
| 95 } | |
| 96 return $self->{pair_dna_align_feature_id}; | |
| 97 } | |
| 98 | |
| 99 =head2 _hit_unit | |
| 100 | |
| 101 Arg [1] : none | |
| 102 Description: PRIVATE implementation of abstract superclass method. Returns | |
| 103 1 as the 'unit' used for the hit sequence. | |
| 104 Returntype : int | |
| 105 Exceptions : none | |
| 106 Caller : Bio::EnsEMBL::BaseAlignFeature | |
| 107 Status : Stable | |
| 108 | |
| 109 =cut | |
| 110 | |
| 111 sub _hit_unit { | |
| 112 return 1; | |
| 113 } | |
| 114 | |
| 115 | |
| 116 | |
| 117 =head2 _query_unit | |
| 118 | |
| 119 Arg [1] : none | |
| 120 Description: PRIVATE implementation of abstract superclass method Returns | |
| 121 1 as the 'unit' used for the hit sequence. | |
| 122 Returntype : int | |
| 123 Exceptions : none | |
| 124 Caller : Bio::EnsEMBL::BaseAlignFeature | |
| 125 Status : Stable | |
| 126 | |
| 127 =cut | |
| 128 | |
| 129 sub _query_unit { | |
| 130 return 1; | |
| 131 } | |
| 132 | |
| 133 =head2 restrict_between_positions | |
| 134 | |
| 135 Arg [1] : int $start | |
| 136 Arg [2] : int $end | |
| 137 Arg [3] : string $flags | |
| 138 SEQ = $start and $end apply to the seq sequence | |
| 139 i.e. start and end methods | |
| 140 HSEQ = $start and $end apply to the hseq sequence | |
| 141 i.e. hstart and hend methods | |
| 142 Example : $daf->restrict_between_positions(150,543,"SEQ") | |
| 143 Description: Build a new DnaDnaAlignFeature object that fits within | |
| 144 the new specified coordinates and sequence reference, cutting | |
| 145 any pieces hanging upstream and downstream. | |
| 146 Returntype : Bio::EnsEMBL::DnaDnaAlignFeature object | |
| 147 Exceptions : | |
| 148 Caller : | |
| 149 Status : Stable | |
| 150 | |
| 151 =cut | |
| 152 | |
| 153 sub restrict_between_positions { | |
| 154 my ($self,$start,$end,$seqref) = @_; | |
| 155 | |
| 156 unless (defined $start && $start =~ /^\d+$/) { | |
| 157 $self->throw("The first argument is not defined or is not an integer"); | |
| 158 } | |
| 159 unless (defined $end && $end =~ /^\d+$/) { | |
| 160 $self->throw("The second argument is not defined or is not an integer"); | |
| 161 } | |
| 162 unless (defined $seqref && | |
| 163 ($seqref eq "SEQ" || $seqref eq "HSEQ")) { | |
| 164 $self->throw("The third argument is not defined or is not equal to 'SEQ' or 'HSEQ'"); | |
| 165 } | |
| 166 | |
| 167 # symbolic method references should be forbidden! | |
| 168 # need to be rewrite at some stage. | |
| 169 | |
| 170 my ($start_method1,$end_method1,$strand_method1,$start_method2,$end_method2,$strand_method2) = | |
| 171 qw(start end strand hstart hend hstrand); | |
| 172 | |
| 173 if ($seqref eq "HSEQ") { | |
| 174 ($start_method1,$end_method1,$strand_method1,$start_method2,$end_method2,$strand_method2) = | |
| 175 qw(hstart hend hstrand start end strand); | |
| 176 } | |
| 177 | |
| 178 my @restricted_features; | |
| 179 | |
| 180 foreach my $ungapped_feature ($self->ungapped_features) { | |
| 181 | |
| 182 if ($ungapped_feature->$start_method1() > $end || | |
| 183 $ungapped_feature->$end_method1() < $start) { | |
| 184 | |
| 185 next; | |
| 186 | |
| 187 } elsif ($ungapped_feature->$end_method1() <= $end && | |
| 188 $ungapped_feature->$start_method1() >= $start) { | |
| 189 | |
| 190 push @restricted_features, $ungapped_feature; | |
| 191 | |
| 192 } else { | |
| 193 | |
| 194 if ($ungapped_feature->$strand_method1() eq $ungapped_feature->$strand_method2()) { | |
| 195 | |
| 196 if ($ungapped_feature->$start_method1() < $start) { | |
| 197 | |
| 198 my $offset = $start - $ungapped_feature->$start_method1(); | |
| 199 $ungapped_feature->$start_method1($start); | |
| 200 $ungapped_feature->$start_method2($ungapped_feature->$start_method2() + $offset); | |
| 201 | |
| 202 } | |
| 203 if ($ungapped_feature->$end_method1() > $end) { | |
| 204 | |
| 205 my $offset = $ungapped_feature->$end_method1() - $end; | |
| 206 $ungapped_feature->$end_method1($end); | |
| 207 $ungapped_feature->$end_method2($ungapped_feature->$end_method2() - $offset); | |
| 208 | |
| 209 } | |
| 210 } else { | |
| 211 | |
| 212 if ($ungapped_feature->$start_method1() < $start) { | |
| 213 | |
| 214 my $offset = $start - $ungapped_feature->$start_method1(); | |
| 215 $ungapped_feature->$start_method1($start); | |
| 216 $ungapped_feature->$end_method2($ungapped_feature->$end_method2() - $offset); | |
| 217 | |
| 218 } | |
| 219 if ($ungapped_feature->$end_method1() > $end) { | |
| 220 | |
| 221 my $offset = $ungapped_feature->$end_method1() - $end; | |
| 222 $ungapped_feature->$end_method1($end); | |
| 223 $ungapped_feature->$start_method2($ungapped_feature->$start_method2() + $offset); | |
| 224 | |
| 225 } | |
| 226 } | |
| 227 | |
| 228 push @restricted_features, $ungapped_feature; | |
| 229 } | |
| 230 } | |
| 231 | |
| 232 if (scalar @restricted_features) { | |
| 233 my $DnaDnaAlignFeature = new Bio::EnsEMBL::DnaDnaAlignFeature('-features' =>\@restricted_features); | |
| 234 if (defined $self->slice) { | |
| 235 $DnaDnaAlignFeature->slice($self->slice); | |
| 236 } | |
| 237 if (defined $self->hslice) { | |
| 238 $DnaDnaAlignFeature->hslice($self->hslice); | |
| 239 } | |
| 240 return $DnaDnaAlignFeature; | |
| 241 } else { | |
| 242 return undef; | |
| 243 } | |
| 244 } | |
| 245 | |
| 246 =head2 alignment_strings | |
| 247 | |
| 248 Arg [1] : list of string $flags | |
| 249 FIX_SEQ = does not introduce gaps (dashes) in seq aligned sequence | |
| 250 and delete the corresponding insertions in hseq aligned sequence | |
| 251 FIX_HSEQ = does not introduce gaps (dashes) in hseq aligned sequence | |
| 252 and delete the corresponding insertions in seq aligned sequence | |
| 253 NO_SEQ = return the seq aligned sequence as an empty string | |
| 254 NO_HSEQ = return the hseq aligned sequence as an empty string | |
| 255 This 2 last flags would save a bit of time as doing so no querying to the core | |
| 256 database in done to get the sequence. | |
| 257 Example : $daf->alignment_strings or | |
| 258 $daf->alignment_strings("FIX_HSEQ") or | |
| 259 $daf->alignment_strings("NO_SEQ","FIX_SEQ") | |
| 260 Description: Allows to rebuild the alignment string of both the seq and hseq sequence | |
| 261 using the cigar_string information and the slice and hslice objects | |
| 262 Returntype : array reference containing 2 strings | |
| 263 the first corresponds to seq | |
| 264 the second corresponds to hseq | |
| 265 Exceptions : | |
| 266 Caller : | |
| 267 Status : Stable | |
| 268 | |
| 269 =cut | |
| 270 | |
| 271 | |
| 272 sub alignment_strings { | |
| 273 my ( $self, @flags ) = @_; | |
| 274 | |
| 275 # set the flags | |
| 276 my $seq_flag = 1; | |
| 277 my $hseq_flag = 1; | |
| 278 my $fix_seq_flag = 0; | |
| 279 my $fix_hseq_flag = 0; | |
| 280 | |
| 281 for my $flag ( @flags ) { | |
| 282 $seq_flag = 0 if ($flag eq "NO_SEQ"); | |
| 283 $hseq_flag = 0 if ($flag eq "NO_HSEQ"); | |
| 284 $fix_seq_flag = 1 if ($flag eq "FIX_SEQ"); | |
| 285 $fix_hseq_flag = 1 if ($flag eq "FIX_HSEQ"); | |
| 286 } | |
| 287 | |
| 288 my ($seq, $hseq); | |
| 289 $seq = $self->slice->subseq($self->start, $self->end, $self->strand) if ($seq_flag || $fix_seq_flag); | |
| 290 $hseq = $self->hslice->subseq($self->hstart, $self->hend, $self->hstrand) if ($hseq_flag || $fix_hseq_flag); | |
| 291 | |
| 292 my $rseq= ""; | |
| 293 # rseq - result sequence | |
| 294 my $rhseq= ""; | |
| 295 # rhseq - result hsequence | |
| 296 | |
| 297 my $seq_pos = 0; | |
| 298 my $hseq_pos = 0; | |
| 299 | |
| 300 my @cig = ( $self->cigar_string =~ /(\d*[DIM])/g ); | |
| 301 | |
| 302 for my $cigElem ( @cig ) { | |
| 303 my $cigType = substr( $cigElem, -1, 1 ); | |
| 304 my $cigCount = substr( $cigElem, 0 ,-1 ); | |
| 305 $cigCount = 1 unless $cigCount; | |
| 306 | |
| 307 if( $cigType eq "M" ) { | |
| 308 $rseq .= substr( $seq, $seq_pos, $cigCount ) if ($seq_flag); | |
| 309 $rhseq .= substr( $hseq, $hseq_pos, $cigCount ) if ($hseq_flag); | |
| 310 $seq_pos += $cigCount; | |
| 311 $hseq_pos += $cigCount; | |
| 312 } elsif( $cigType eq "D" ) { | |
| 313 if( ! $fix_seq_flag ) { | |
| 314 $rseq .= "-" x $cigCount if ($seq_flag); | |
| 315 $rhseq .= substr( $hseq, $hseq_pos, $cigCount ) if ($hseq_flag); | |
| 316 } | |
| 317 $hseq_pos += $cigCount; | |
| 318 } elsif( $cigType eq "I" ) { | |
| 319 if( ! $fix_hseq_flag ) { | |
| 320 $rseq .= substr( $seq, $seq_pos, $cigCount ) if ($seq_flag); | |
| 321 $rhseq .= "-" x $cigCount if ($hseq_flag); | |
| 322 } | |
| 323 $seq_pos += $cigCount; | |
| 324 } | |
| 325 } | |
| 326 return [ $rseq,$rhseq ]; | |
| 327 } | |
| 328 | |
| 329 =head2 get_SimpleAlign | |
| 330 | |
| 331 Arg [1] : list of string $flags | |
| 332 translated = by default, the sequence alignment will be on nucleotide. With translated flag | |
| 333 the aligned sequences are translated. | |
| 334 uc = by default aligned sequences are given in lower cases. With uc flag, the aligned | |
| 335 sequences are given in upper cases. | |
| 336 Example : $daf->get_SimpleAlign or | |
| 337 $daf->get_SimpleAlign("translated") or | |
| 338 $daf->get_SimpleAlign("translated","uc") | |
| 339 Description: Allows to rebuild the alignment string of both the seq and hseq sequence | |
| 340 using the cigar_string information and the slice and hslice objects | |
| 341 Returntype : a Bio::SimpleAlign object | |
| 342 Exceptions : | |
| 343 Caller : | |
| 344 Status : Stable | |
| 345 | |
| 346 =cut | |
| 347 | |
| 348 sub get_SimpleAlign { | |
| 349 my ( $self, @flags ) = @_; | |
| 350 | |
| 351 # setting the flags | |
| 352 my $uc = 0; | |
| 353 my $translated = 0; | |
| 354 | |
| 355 for my $flag ( @flags ) { | |
| 356 $uc = 1 if ($flag =~ /^uc$/i); | |
| 357 $translated = 1 if ($flag =~ /^translated$/i); | |
| 358 } | |
| 359 | |
| 360 my $sa = Bio::SimpleAlign->new(); | |
| 361 | |
| 362 #Hack to try to work with both bioperl 0.7 and 1.2: | |
| 363 #Check to see if the method is called 'addSeq' or 'add_seq' | |
| 364 my $bio07 = 0; | |
| 365 if(!$sa->can('add_seq')) { | |
| 366 $bio07 = 1; | |
| 367 } | |
| 368 | |
| 369 my ($sb_seq,$qy_seq) = @{$self->alignment_strings}; | |
| 370 | |
| 371 my $loc_sb_seq = Bio::LocatableSeq->new(-SEQ => $uc ? uc $sb_seq : lc $sb_seq, | |
| 372 -START => $self->seq_region_start, | |
| 373 -END => $self->seq_region_end, | |
| 374 -ID => $self->seqname, | |
| 375 -STRAND => $self->strand); | |
| 376 | |
| 377 $loc_sb_seq->seq($uc ? uc $loc_sb_seq->translate->seq | |
| 378 : lc $loc_sb_seq->translate->seq) if ($translated); | |
| 379 | |
| 380 my $loc_qy_seq = Bio::LocatableSeq->new(-SEQ => $uc ? uc $qy_seq : lc $qy_seq, | |
| 381 -START => $self->hseq_region_start, | |
| 382 -END => $self->hseq_region_end, | |
| 383 -ID => $self->hseqname, | |
| 384 -STRAND => $self->hstrand); | |
| 385 | |
| 386 $loc_qy_seq->seq($uc ? uc $loc_qy_seq->translate->seq | |
| 387 : lc $loc_qy_seq->translate->seq) if ($translated); | |
| 388 | |
| 389 if($bio07) { | |
| 390 $sa->addSeq($loc_sb_seq); | |
| 391 $sa->addSeq($loc_qy_seq); | |
| 392 } else { | |
| 393 $sa->add_seq($loc_sb_seq); | |
| 394 $sa->add_seq($loc_qy_seq); | |
| 395 } | |
| 396 | |
| 397 return $sa; | |
| 398 } | |
| 399 | |
| 400 1; |
