Mercurial > repos > mahtabm > ensemb_rep_gvl
comparison variant_effect_predictor/Bio/LocatableSeq.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:2bc9b66ada89 |
---|---|
1 # $Id: LocatableSeq.pm,v 1.22.2.1 2003/03/31 11:49:51 heikki Exp $ | |
2 # | |
3 # BioPerl module for Bio::LocatableSeq | |
4 # | |
5 # Cared for by Ewan Birney <birney@sanger.ac.uk> | |
6 # | |
7 # Copyright Ewan Birney | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 | |
11 # POD documentation - main docs before the code | |
12 | |
13 =head1 NAME | |
14 | |
15 Bio::LocatableSeq - A Sequence object with start/end points on it | |
16 that can be projected into a MSA or have coordinates relative to another seq. | |
17 | |
18 =head1 SYNOPSIS | |
19 | |
20 | |
21 use Bio::LocatableSeq; | |
22 my $seq = new Bio::LocatableSeq(-seq => "CAGT-GGT", | |
23 -id => "seq1", | |
24 -start => 1, | |
25 -end => 7); | |
26 | |
27 | |
28 =head1 DESCRIPTION | |
29 | |
30 | |
31 # a normal sequence object | |
32 $locseq->seq(); | |
33 $locseq->id(); | |
34 | |
35 # has start,end points | |
36 $locseq->start(); | |
37 $locseq->end(); | |
38 | |
39 # inheriets off RangeI, so range operations possible | |
40 | |
41 =head1 FEEDBACK | |
42 | |
43 | |
44 =head2 Mailing Lists | |
45 | |
46 | |
47 User feedback is an integral part of the evolution of this and other | |
48 Bioperl modules. Send your comments and suggestions preferably to one | |
49 of the Bioperl mailing lists. Your participation is much appreciated. | |
50 | |
51 bioperl-l@bioperl.org - General discussion | |
52 http://bio.perl.org/MailList.html - About the mailing lists | |
53 | |
54 | |
55 The locatable sequence object was developed mainly because the | |
56 SimpleAlign object requires this functionality, and in the rewrite | |
57 of the Sequence object we had to decide what to do with this. | |
58 | |
59 It is, to be honest, not well integrated with the rest of bioperl, for | |
60 example, the trunc() function does not return a LocatableSeq object, | |
61 as some might have thought. There are all sorts of nasty gotcha's | |
62 about interactions between coordinate systems when these sort of | |
63 objects are used. | |
64 | |
65 | |
66 =head2 Reporting Bugs | |
67 | |
68 Report bugs to the Bioperl bug tracking system to help us keep track | |
69 the bugs and their resolution. Bug reports can be submitted via email | |
70 or the web: | |
71 | |
72 bioperl-bugs@bio.perl.org | |
73 http://bugzilla.bioperl.org/ | |
74 | |
75 | |
76 =head1 APPENDIX | |
77 | |
78 The rest of the documentation details each of the object | |
79 methods. Internal methods are usually preceded with a _ | |
80 | |
81 =cut | |
82 | |
83 #' | |
84 # Let the code begin... | |
85 | |
86 package Bio::LocatableSeq; | |
87 use vars qw(@ISA); | |
88 use strict; | |
89 | |
90 use Bio::PrimarySeq; | |
91 use Bio::RangeI; | |
92 use Bio::Location::Simple; | |
93 use Bio::Location::Fuzzy; | |
94 | |
95 | |
96 @ISA = qw(Bio::PrimarySeq Bio::RangeI); | |
97 | |
98 sub new { | |
99 my ($class, @args) = @_; | |
100 my $self = $class->SUPER::new(@args); | |
101 | |
102 my ($start,$end,$strand) = | |
103 $self->_rearrange( [qw(START END STRAND)], | |
104 @args); | |
105 | |
106 defined $start && $self->start($start); | |
107 defined $end && $self->end($end); | |
108 defined $strand && $self->strand($strand); | |
109 | |
110 return $self; # success - we hope! | |
111 } | |
112 | |
113 =head2 start | |
114 | |
115 Title : start | |
116 Usage : $obj->start($newval) | |
117 Function: | |
118 Returns : value of start | |
119 Args : newvalue (optional) | |
120 | |
121 =cut | |
122 | |
123 sub start{ | |
124 my $self = shift; | |
125 if( @_ ) { | |
126 my $value = shift; | |
127 $self->{'start'} = $value; | |
128 } | |
129 return $self->{'start'}; | |
130 | |
131 } | |
132 | |
133 =head2 end | |
134 | |
135 Title : end | |
136 Usage : $obj->end($newval) | |
137 Function: | |
138 Returns : value of end | |
139 Args : newvalue (optional) | |
140 | |
141 =cut | |
142 | |
143 sub end { | |
144 my $self = shift; | |
145 if( @_ ) { | |
146 my $value = shift; | |
147 my $string = $self->seq; | |
148 if ($string and $self->start) { | |
149 my $s2 = $string; | |
150 $string =~ s/[.-]+//g; | |
151 my $len = CORE::length $string; | |
152 my $new_end = $self->start + $len - 1 ; | |
153 my $id = $self->id; | |
154 $self->warn("In sequence $id residue count gives value $len. | |
155 Overriding value [$value] with value $new_end for Bio::LocatableSeq::end().") | |
156 and $value = $new_end if $new_end != $value and $self->verbose > 0; | |
157 } | |
158 | |
159 $self->{'end'} = $value; | |
160 } | |
161 return $self->{'end'}; | |
162 | |
163 } | |
164 | |
165 =head2 strand | |
166 | |
167 Title : strand | |
168 Usage : $obj->strand($newval) | |
169 Function: | |
170 Returns : value of strand | |
171 Args : newvalue (optional) | |
172 | |
173 =cut | |
174 | |
175 sub strand{ | |
176 my $self = shift; | |
177 if( @_ ) { | |
178 my $value = shift; | |
179 $self->{'strand'} = $value; | |
180 } | |
181 return $self->{'strand'}; | |
182 } | |
183 | |
184 =head2 get_nse | |
185 | |
186 Title : get_nse | |
187 Usage : | |
188 Function: read-only name of form id/start-end | |
189 Example : | |
190 Returns : | |
191 Args : | |
192 | |
193 =cut | |
194 | |
195 sub get_nse{ | |
196 my ($self,$char1,$char2) = @_; | |
197 | |
198 $char1 ||= "/"; | |
199 $char2 ||= "-"; | |
200 | |
201 $self->throw("Attribute id not set") unless $self->id(); | |
202 $self->throw("Attribute start not set") unless $self->start(); | |
203 $self->throw("Attribute end not set") unless $self->end(); | |
204 | |
205 return $self->id() . $char1 . $self->start . $char2 . $self->end ; | |
206 | |
207 } | |
208 | |
209 | |
210 =head2 no_gap | |
211 | |
212 Title : no_gaps | |
213 Usage :$self->no_gaps('.') | |
214 Function: | |
215 | |
216 Gets number of gaps in the sequence. The count excludes | |
217 leading or trailing gap characters. | |
218 | |
219 Valid bioperl sequence characters are [A-Za-z\-\.\*]. Of | |
220 these, '.' and '-' are counted as gap characters unless an | |
221 optional argument specifies one of them. | |
222 | |
223 Returns : number of internal gaps in the sequnce. | |
224 Args : a gap character (optional) | |
225 | |
226 =cut | |
227 | |
228 sub no_gaps { | |
229 my ($self,$char) = @_; | |
230 my ($seq, $count) = (undef, 0); | |
231 | |
232 # default gap characters | |
233 $char ||= '-.'; | |
234 | |
235 $self->warn("I hope you know what you are doing setting gap to [$char]") | |
236 unless $char =~ /[-.]/; | |
237 | |
238 $seq = $self->seq; | |
239 return 0 unless $seq; # empty sequence does not have gaps | |
240 | |
241 $seq =~ s/^([$char]+)//; | |
242 $seq =~ s/([$char]+)$//; | |
243 $count++ while $seq =~ /[$char]+/g; | |
244 | |
245 return $count; | |
246 | |
247 } | |
248 | |
249 | |
250 =head2 column_from_residue_number | |
251 | |
252 Title : column_from_residue_number | |
253 Usage : $col = $seq->column_from_residue_number($resnumber) | |
254 Function: | |
255 | |
256 This function gives the position in the alignment | |
257 (i.e. column number) of the given residue number in the | |
258 sequence. For example, for the sequence | |
259 | |
260 Seq1/91-97 AC..DEF.GH | |
261 | |
262 column_from_residue_number(94) returns 5. | |
263 | |
264 An exception is thrown if the residue number would lie | |
265 outside the length of the aligment | |
266 (e.g. column_from_residue_number( "Seq2", 22 ) | |
267 | |
268 Returns : A column number for the position of the | |
269 given residue in the given sequence (1 = first column) | |
270 Args : A residue number in the whole sequence (not just that | |
271 segment of it in the alignment) | |
272 | |
273 =cut | |
274 | |
275 sub column_from_residue_number { | |
276 my ($self, $resnumber) = @_; | |
277 | |
278 $self->throw("Residue number has to be a positive integer, not [$resnumber]") | |
279 unless $resnumber =~ /^\d+$/ and $resnumber > 0; | |
280 | |
281 if ($resnumber >= $self->start() and $resnumber <= $self->end()) { | |
282 my @residues = split //, $self->seq; | |
283 my $count = $self->start(); | |
284 my $i; | |
285 for ($i=0; $i < @residues; $i++) { | |
286 if ($residues[$i] ne '.' and $residues[$i] ne '-') { | |
287 $count == $resnumber and last; | |
288 $count++; | |
289 } | |
290 } | |
291 # $i now holds the index of the column. | |
292 # The actual column number is this index + 1 | |
293 | |
294 return $i+1; | |
295 } | |
296 | |
297 $self->throw("Could not find residue number $resnumber"); | |
298 | |
299 } | |
300 | |
301 | |
302 =head2 location_from_column | |
303 | |
304 Title : location_from_column | |
305 Usage : $loc = $ali->location_from_column( $seq_number, $column_number) | |
306 Function: | |
307 | |
308 This function gives the residue number in the sequence with | |
309 the given name for a given position in the alignment | |
310 (i.e. column number) of the given. Gaps complicate this | |
311 process and force the output to be a L<Bio::Range> where | |
312 values can be undefined. For example, for the alignment | |
313 | |
314 Seq1/91-97 AC..DEF.G. | |
315 Seq2/1-9 .CYHDEFKGK | |
316 | |
317 location_from_column( Seq1/91-97, 3 ) position 93 | |
318 location_from_column( Seq1/91-97, 2 ) position 92^93 | |
319 location_from_column( Seq1/91-97, 10) position 97^98 | |
320 location_from_column( Seq2/1-9, 1 ) position undef | |
321 | |
322 An exact position returns a Bio::Location::Simple object | |
323 where where location_type() returns 'EXACT', if a position | |
324 is between bases location_type() returns 'IN-BETWEEN'. | |
325 Column before the first residue returns undef. Note that if | |
326 the position is after the last residue in the alignment, | |
327 that there is no guarantee that the original sequence has | |
328 residues after that position. | |
329 | |
330 An exception is thrown if the column number is not within | |
331 the sequence. | |
332 | |
333 Returns : Bio::Location::Simple or undef | |
334 Args : A column number | |
335 Throws : If column is not within the sequence | |
336 | |
337 See L<Bio::Location::Simple> for more. | |
338 | |
339 =cut | |
340 | |
341 sub location_from_column { | |
342 my ($self, $column) = @_; | |
343 | |
344 $self->throw("Column number has to be a positive integer, not [$column]") | |
345 unless $column =~ /^\d+$/ and $column > 0; | |
346 $self->throw("Column number [column] is larger than". | |
347 " sequence length [". $self->length. "]") | |
348 unless $column <= $self->length; | |
349 | |
350 my ($loc); | |
351 my $s = $self->subseq(1,$column); | |
352 $s =~ s/\W//g; | |
353 my $pos = CORE::length $s; | |
354 | |
355 my $start = $self->start || 0 ; | |
356 if ($self->subseq($column, $column) =~ /[a-zA-Z]/ ) { | |
357 $loc = new Bio::Location::Simple | |
358 (-start => $pos + $start - 1, | |
359 -end => $pos + $start - 1, | |
360 -strand => 1 | |
361 ); | |
362 } | |
363 elsif ($pos == 0 and $self->start == 1) { | |
364 } else { | |
365 $loc = new Bio::Location::Simple | |
366 (-start => $pos + $start - 1, | |
367 -end => $pos +1 + $start - 1, | |
368 -strand => 1, | |
369 -location_type => 'IN-BETWEEN' | |
370 ); | |
371 } | |
372 return $loc; | |
373 } | |
374 | |
375 =head2 revcom | |
376 | |
377 Title : revcom | |
378 Usage : $rev = $seq->revcom() | |
379 Function: Produces a new Bio::LocatableSeq object which | |
380 has the reversed complement of the sequence. For protein | |
381 sequences this throws an exception of "Sequence is a | |
382 protein. Cannot revcom" | |
383 | |
384 Returns : A new Bio::LocatableSeq object | |
385 Args : none | |
386 | |
387 =cut | |
388 | |
389 sub revcom { | |
390 my ($self) = @_; | |
391 | |
392 my $new = $self->SUPER::revcom; | |
393 $new->strand($self->strand * -1); | |
394 $new->start($self->start) if $self->start; | |
395 $new->end($self->end) if $self->end; | |
396 return $new; | |
397 } | |
398 | |
399 | |
400 =head2 trunc | |
401 | |
402 Title : trunc | |
403 Usage : $subseq = $myseq->trunc(10,100); | |
404 Function: Provides a truncation of a sequence, | |
405 | |
406 Example : | |
407 Returns : a fresh Bio::PrimarySeqI implementing object | |
408 Args : Two integers denoting first and last columns of the | |
409 sequence to be included into sub-sequence. | |
410 | |
411 | |
412 =cut | |
413 | |
414 sub trunc { | |
415 | |
416 my ($self, $start, $end) = @_; | |
417 my $new = $self->SUPER::trunc($start, $end); | |
418 | |
419 $start = $self->location_from_column($start); | |
420 $start ? ($start = $start->start) : ($start = 1); | |
421 | |
422 $end = $self->location_from_column($end); | |
423 $end = $end->start if $end; | |
424 | |
425 $new->strand($self->strand); | |
426 $new->start($start) if $start; | |
427 $new->end($end) if $end; | |
428 | |
429 return $new; | |
430 } | |
431 | |
432 1; |