Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/LocatableSeq.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: LocatableSeq.pm,v 1.22.2.1 2003/03/31 11:49:51 heikki Exp $ | |
| 2 # | |
| 3 # BioPerl module for Bio::LocatableSeq | |
| 4 # | |
| 5 # Cared for by Ewan Birney <birney@sanger.ac.uk> | |
| 6 # | |
| 7 # Copyright Ewan Birney | |
| 8 # | |
| 9 # You may distribute this module under the same terms as perl itself | |
| 10 | |
| 11 # POD documentation - main docs before the code | |
| 12 | |
| 13 =head1 NAME | |
| 14 | |
| 15 Bio::LocatableSeq - A Sequence object with start/end points on it | |
| 16 that can be projected into a MSA or have coordinates relative to another seq. | |
| 17 | |
| 18 =head1 SYNOPSIS | |
| 19 | |
| 20 | |
| 21 use Bio::LocatableSeq; | |
| 22 my $seq = new Bio::LocatableSeq(-seq => "CAGT-GGT", | |
| 23 -id => "seq1", | |
| 24 -start => 1, | |
| 25 -end => 7); | |
| 26 | |
| 27 | |
| 28 =head1 DESCRIPTION | |
| 29 | |
| 30 | |
| 31 # a normal sequence object | |
| 32 $locseq->seq(); | |
| 33 $locseq->id(); | |
| 34 | |
| 35 # has start,end points | |
| 36 $locseq->start(); | |
| 37 $locseq->end(); | |
| 38 | |
| 39 # inheriets off RangeI, so range operations possible | |
| 40 | |
| 41 =head1 FEEDBACK | |
| 42 | |
| 43 | |
| 44 =head2 Mailing Lists | |
| 45 | |
| 46 | |
| 47 User feedback is an integral part of the evolution of this and other | |
| 48 Bioperl modules. Send your comments and suggestions preferably to one | |
| 49 of the Bioperl mailing lists. Your participation is much appreciated. | |
| 50 | |
| 51 bioperl-l@bioperl.org - General discussion | |
| 52 http://bio.perl.org/MailList.html - About the mailing lists | |
| 53 | |
| 54 | |
| 55 The locatable sequence object was developed mainly because the | |
| 56 SimpleAlign object requires this functionality, and in the rewrite | |
| 57 of the Sequence object we had to decide what to do with this. | |
| 58 | |
| 59 It is, to be honest, not well integrated with the rest of bioperl, for | |
| 60 example, the trunc() function does not return a LocatableSeq object, | |
| 61 as some might have thought. There are all sorts of nasty gotcha's | |
| 62 about interactions between coordinate systems when these sort of | |
| 63 objects are used. | |
| 64 | |
| 65 | |
| 66 =head2 Reporting Bugs | |
| 67 | |
| 68 Report bugs to the Bioperl bug tracking system to help us keep track | |
| 69 the bugs and their resolution. Bug reports can be submitted via email | |
| 70 or the web: | |
| 71 | |
| 72 bioperl-bugs@bio.perl.org | |
| 73 http://bugzilla.bioperl.org/ | |
| 74 | |
| 75 | |
| 76 =head1 APPENDIX | |
| 77 | |
| 78 The rest of the documentation details each of the object | |
| 79 methods. Internal methods are usually preceded with a _ | |
| 80 | |
| 81 =cut | |
| 82 | |
| 83 #' | |
| 84 # Let the code begin... | |
| 85 | |
| 86 package Bio::LocatableSeq; | |
| 87 use vars qw(@ISA); | |
| 88 use strict; | |
| 89 | |
| 90 use Bio::PrimarySeq; | |
| 91 use Bio::RangeI; | |
| 92 use Bio::Location::Simple; | |
| 93 use Bio::Location::Fuzzy; | |
| 94 | |
| 95 | |
| 96 @ISA = qw(Bio::PrimarySeq Bio::RangeI); | |
| 97 | |
| 98 sub new { | |
| 99 my ($class, @args) = @_; | |
| 100 my $self = $class->SUPER::new(@args); | |
| 101 | |
| 102 my ($start,$end,$strand) = | |
| 103 $self->_rearrange( [qw(START END STRAND)], | |
| 104 @args); | |
| 105 | |
| 106 defined $start && $self->start($start); | |
| 107 defined $end && $self->end($end); | |
| 108 defined $strand && $self->strand($strand); | |
| 109 | |
| 110 return $self; # success - we hope! | |
| 111 } | |
| 112 | |
| 113 =head2 start | |
| 114 | |
| 115 Title : start | |
| 116 Usage : $obj->start($newval) | |
| 117 Function: | |
| 118 Returns : value of start | |
| 119 Args : newvalue (optional) | |
| 120 | |
| 121 =cut | |
| 122 | |
| 123 sub start{ | |
| 124 my $self = shift; | |
| 125 if( @_ ) { | |
| 126 my $value = shift; | |
| 127 $self->{'start'} = $value; | |
| 128 } | |
| 129 return $self->{'start'}; | |
| 130 | |
| 131 } | |
| 132 | |
| 133 =head2 end | |
| 134 | |
| 135 Title : end | |
| 136 Usage : $obj->end($newval) | |
| 137 Function: | |
| 138 Returns : value of end | |
| 139 Args : newvalue (optional) | |
| 140 | |
| 141 =cut | |
| 142 | |
| 143 sub end { | |
| 144 my $self = shift; | |
| 145 if( @_ ) { | |
| 146 my $value = shift; | |
| 147 my $string = $self->seq; | |
| 148 if ($string and $self->start) { | |
| 149 my $s2 = $string; | |
| 150 $string =~ s/[.-]+//g; | |
| 151 my $len = CORE::length $string; | |
| 152 my $new_end = $self->start + $len - 1 ; | |
| 153 my $id = $self->id; | |
| 154 $self->warn("In sequence $id residue count gives value $len. | |
| 155 Overriding value [$value] with value $new_end for Bio::LocatableSeq::end().") | |
| 156 and $value = $new_end if $new_end != $value and $self->verbose > 0; | |
| 157 } | |
| 158 | |
| 159 $self->{'end'} = $value; | |
| 160 } | |
| 161 return $self->{'end'}; | |
| 162 | |
| 163 } | |
| 164 | |
| 165 =head2 strand | |
| 166 | |
| 167 Title : strand | |
| 168 Usage : $obj->strand($newval) | |
| 169 Function: | |
| 170 Returns : value of strand | |
| 171 Args : newvalue (optional) | |
| 172 | |
| 173 =cut | |
| 174 | |
| 175 sub strand{ | |
| 176 my $self = shift; | |
| 177 if( @_ ) { | |
| 178 my $value = shift; | |
| 179 $self->{'strand'} = $value; | |
| 180 } | |
| 181 return $self->{'strand'}; | |
| 182 } | |
| 183 | |
| 184 =head2 get_nse | |
| 185 | |
| 186 Title : get_nse | |
| 187 Usage : | |
| 188 Function: read-only name of form id/start-end | |
| 189 Example : | |
| 190 Returns : | |
| 191 Args : | |
| 192 | |
| 193 =cut | |
| 194 | |
| 195 sub get_nse{ | |
| 196 my ($self,$char1,$char2) = @_; | |
| 197 | |
| 198 $char1 ||= "/"; | |
| 199 $char2 ||= "-"; | |
| 200 | |
| 201 $self->throw("Attribute id not set") unless $self->id(); | |
| 202 $self->throw("Attribute start not set") unless $self->start(); | |
| 203 $self->throw("Attribute end not set") unless $self->end(); | |
| 204 | |
| 205 return $self->id() . $char1 . $self->start . $char2 . $self->end ; | |
| 206 | |
| 207 } | |
| 208 | |
| 209 | |
| 210 =head2 no_gap | |
| 211 | |
| 212 Title : no_gaps | |
| 213 Usage :$self->no_gaps('.') | |
| 214 Function: | |
| 215 | |
| 216 Gets number of gaps in the sequence. The count excludes | |
| 217 leading or trailing gap characters. | |
| 218 | |
| 219 Valid bioperl sequence characters are [A-Za-z\-\.\*]. Of | |
| 220 these, '.' and '-' are counted as gap characters unless an | |
| 221 optional argument specifies one of them. | |
| 222 | |
| 223 Returns : number of internal gaps in the sequnce. | |
| 224 Args : a gap character (optional) | |
| 225 | |
| 226 =cut | |
| 227 | |
| 228 sub no_gaps { | |
| 229 my ($self,$char) = @_; | |
| 230 my ($seq, $count) = (undef, 0); | |
| 231 | |
| 232 # default gap characters | |
| 233 $char ||= '-.'; | |
| 234 | |
| 235 $self->warn("I hope you know what you are doing setting gap to [$char]") | |
| 236 unless $char =~ /[-.]/; | |
| 237 | |
| 238 $seq = $self->seq; | |
| 239 return 0 unless $seq; # empty sequence does not have gaps | |
| 240 | |
| 241 $seq =~ s/^([$char]+)//; | |
| 242 $seq =~ s/([$char]+)$//; | |
| 243 $count++ while $seq =~ /[$char]+/g; | |
| 244 | |
| 245 return $count; | |
| 246 | |
| 247 } | |
| 248 | |
| 249 | |
| 250 =head2 column_from_residue_number | |
| 251 | |
| 252 Title : column_from_residue_number | |
| 253 Usage : $col = $seq->column_from_residue_number($resnumber) | |
| 254 Function: | |
| 255 | |
| 256 This function gives the position in the alignment | |
| 257 (i.e. column number) of the given residue number in the | |
| 258 sequence. For example, for the sequence | |
| 259 | |
| 260 Seq1/91-97 AC..DEF.GH | |
| 261 | |
| 262 column_from_residue_number(94) returns 5. | |
| 263 | |
| 264 An exception is thrown if the residue number would lie | |
| 265 outside the length of the aligment | |
| 266 (e.g. column_from_residue_number( "Seq2", 22 ) | |
| 267 | |
| 268 Returns : A column number for the position of the | |
| 269 given residue in the given sequence (1 = first column) | |
| 270 Args : A residue number in the whole sequence (not just that | |
| 271 segment of it in the alignment) | |
| 272 | |
| 273 =cut | |
| 274 | |
| 275 sub column_from_residue_number { | |
| 276 my ($self, $resnumber) = @_; | |
| 277 | |
| 278 $self->throw("Residue number has to be a positive integer, not [$resnumber]") | |
| 279 unless $resnumber =~ /^\d+$/ and $resnumber > 0; | |
| 280 | |
| 281 if ($resnumber >= $self->start() and $resnumber <= $self->end()) { | |
| 282 my @residues = split //, $self->seq; | |
| 283 my $count = $self->start(); | |
| 284 my $i; | |
| 285 for ($i=0; $i < @residues; $i++) { | |
| 286 if ($residues[$i] ne '.' and $residues[$i] ne '-') { | |
| 287 $count == $resnumber and last; | |
| 288 $count++; | |
| 289 } | |
| 290 } | |
| 291 # $i now holds the index of the column. | |
| 292 # The actual column number is this index + 1 | |
| 293 | |
| 294 return $i+1; | |
| 295 } | |
| 296 | |
| 297 $self->throw("Could not find residue number $resnumber"); | |
| 298 | |
| 299 } | |
| 300 | |
| 301 | |
| 302 =head2 location_from_column | |
| 303 | |
| 304 Title : location_from_column | |
| 305 Usage : $loc = $ali->location_from_column( $seq_number, $column_number) | |
| 306 Function: | |
| 307 | |
| 308 This function gives the residue number in the sequence with | |
| 309 the given name for a given position in the alignment | |
| 310 (i.e. column number) of the given. Gaps complicate this | |
| 311 process and force the output to be a L<Bio::Range> where | |
| 312 values can be undefined. For example, for the alignment | |
| 313 | |
| 314 Seq1/91-97 AC..DEF.G. | |
| 315 Seq2/1-9 .CYHDEFKGK | |
| 316 | |
| 317 location_from_column( Seq1/91-97, 3 ) position 93 | |
| 318 location_from_column( Seq1/91-97, 2 ) position 92^93 | |
| 319 location_from_column( Seq1/91-97, 10) position 97^98 | |
| 320 location_from_column( Seq2/1-9, 1 ) position undef | |
| 321 | |
| 322 An exact position returns a Bio::Location::Simple object | |
| 323 where where location_type() returns 'EXACT', if a position | |
| 324 is between bases location_type() returns 'IN-BETWEEN'. | |
| 325 Column before the first residue returns undef. Note that if | |
| 326 the position is after the last residue in the alignment, | |
| 327 that there is no guarantee that the original sequence has | |
| 328 residues after that position. | |
| 329 | |
| 330 An exception is thrown if the column number is not within | |
| 331 the sequence. | |
| 332 | |
| 333 Returns : Bio::Location::Simple or undef | |
| 334 Args : A column number | |
| 335 Throws : If column is not within the sequence | |
| 336 | |
| 337 See L<Bio::Location::Simple> for more. | |
| 338 | |
| 339 =cut | |
| 340 | |
| 341 sub location_from_column { | |
| 342 my ($self, $column) = @_; | |
| 343 | |
| 344 $self->throw("Column number has to be a positive integer, not [$column]") | |
| 345 unless $column =~ /^\d+$/ and $column > 0; | |
| 346 $self->throw("Column number [column] is larger than". | |
| 347 " sequence length [". $self->length. "]") | |
| 348 unless $column <= $self->length; | |
| 349 | |
| 350 my ($loc); | |
| 351 my $s = $self->subseq(1,$column); | |
| 352 $s =~ s/\W//g; | |
| 353 my $pos = CORE::length $s; | |
| 354 | |
| 355 my $start = $self->start || 0 ; | |
| 356 if ($self->subseq($column, $column) =~ /[a-zA-Z]/ ) { | |
| 357 $loc = new Bio::Location::Simple | |
| 358 (-start => $pos + $start - 1, | |
| 359 -end => $pos + $start - 1, | |
| 360 -strand => 1 | |
| 361 ); | |
| 362 } | |
| 363 elsif ($pos == 0 and $self->start == 1) { | |
| 364 } else { | |
| 365 $loc = new Bio::Location::Simple | |
| 366 (-start => $pos + $start - 1, | |
| 367 -end => $pos +1 + $start - 1, | |
| 368 -strand => 1, | |
| 369 -location_type => 'IN-BETWEEN' | |
| 370 ); | |
| 371 } | |
| 372 return $loc; | |
| 373 } | |
| 374 | |
| 375 =head2 revcom | |
| 376 | |
| 377 Title : revcom | |
| 378 Usage : $rev = $seq->revcom() | |
| 379 Function: Produces a new Bio::LocatableSeq object which | |
| 380 has the reversed complement of the sequence. For protein | |
| 381 sequences this throws an exception of "Sequence is a | |
| 382 protein. Cannot revcom" | |
| 383 | |
| 384 Returns : A new Bio::LocatableSeq object | |
| 385 Args : none | |
| 386 | |
| 387 =cut | |
| 388 | |
| 389 sub revcom { | |
| 390 my ($self) = @_; | |
| 391 | |
| 392 my $new = $self->SUPER::revcom; | |
| 393 $new->strand($self->strand * -1); | |
| 394 $new->start($self->start) if $self->start; | |
| 395 $new->end($self->end) if $self->end; | |
| 396 return $new; | |
| 397 } | |
| 398 | |
| 399 | |
| 400 =head2 trunc | |
| 401 | |
| 402 Title : trunc | |
| 403 Usage : $subseq = $myseq->trunc(10,100); | |
| 404 Function: Provides a truncation of a sequence, | |
| 405 | |
| 406 Example : | |
| 407 Returns : a fresh Bio::PrimarySeqI implementing object | |
| 408 Args : Two integers denoting first and last columns of the | |
| 409 sequence to be included into sub-sequence. | |
| 410 | |
| 411 | |
| 412 =cut | |
| 413 | |
| 414 sub trunc { | |
| 415 | |
| 416 my ($self, $start, $end) = @_; | |
| 417 my $new = $self->SUPER::trunc($start, $end); | |
| 418 | |
| 419 $start = $self->location_from_column($start); | |
| 420 $start ? ($start = $start->start) : ($start = 1); | |
| 421 | |
| 422 $end = $self->location_from_column($end); | |
| 423 $end = $end->start if $end; | |
| 424 | |
| 425 $new->strand($self->strand); | |
| 426 $new->start($start) if $start; | |
| 427 $new->end($end) if $end; | |
| 428 | |
| 429 return $new; | |
| 430 } | |
| 431 | |
| 432 1; |
