0
|
1 # $Id: LocatableSeq.pm,v 1.22.2.1 2003/03/31 11:49:51 heikki Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::LocatableSeq
|
|
4 #
|
|
5 # Cared for by Ewan Birney <birney@sanger.ac.uk>
|
|
6 #
|
|
7 # Copyright Ewan Birney
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10
|
|
11 # POD documentation - main docs before the code
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 Bio::LocatableSeq - A Sequence object with start/end points on it
|
|
16 that can be projected into a MSA or have coordinates relative to another seq.
|
|
17
|
|
18 =head1 SYNOPSIS
|
|
19
|
|
20
|
|
21 use Bio::LocatableSeq;
|
|
22 my $seq = new Bio::LocatableSeq(-seq => "CAGT-GGT",
|
|
23 -id => "seq1",
|
|
24 -start => 1,
|
|
25 -end => 7);
|
|
26
|
|
27
|
|
28 =head1 DESCRIPTION
|
|
29
|
|
30
|
|
31 # a normal sequence object
|
|
32 $locseq->seq();
|
|
33 $locseq->id();
|
|
34
|
|
35 # has start,end points
|
|
36 $locseq->start();
|
|
37 $locseq->end();
|
|
38
|
|
39 # inheriets off RangeI, so range operations possible
|
|
40
|
|
41 =head1 FEEDBACK
|
|
42
|
|
43
|
|
44 =head2 Mailing Lists
|
|
45
|
|
46
|
|
47 User feedback is an integral part of the evolution of this and other
|
|
48 Bioperl modules. Send your comments and suggestions preferably to one
|
|
49 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
50
|
|
51 bioperl-l@bioperl.org - General discussion
|
|
52 http://bio.perl.org/MailList.html - About the mailing lists
|
|
53
|
|
54
|
|
55 The locatable sequence object was developed mainly because the
|
|
56 SimpleAlign object requires this functionality, and in the rewrite
|
|
57 of the Sequence object we had to decide what to do with this.
|
|
58
|
|
59 It is, to be honest, not well integrated with the rest of bioperl, for
|
|
60 example, the trunc() function does not return a LocatableSeq object,
|
|
61 as some might have thought. There are all sorts of nasty gotcha's
|
|
62 about interactions between coordinate systems when these sort of
|
|
63 objects are used.
|
|
64
|
|
65
|
|
66 =head2 Reporting Bugs
|
|
67
|
|
68 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
69 the bugs and their resolution. Bug reports can be submitted via email
|
|
70 or the web:
|
|
71
|
|
72 bioperl-bugs@bio.perl.org
|
|
73 http://bugzilla.bioperl.org/
|
|
74
|
|
75
|
|
76 =head1 APPENDIX
|
|
77
|
|
78 The rest of the documentation details each of the object
|
|
79 methods. Internal methods are usually preceded with a _
|
|
80
|
|
81 =cut
|
|
82
|
|
83 #'
|
|
84 # Let the code begin...
|
|
85
|
|
86 package Bio::LocatableSeq;
|
|
87 use vars qw(@ISA);
|
|
88 use strict;
|
|
89
|
|
90 use Bio::PrimarySeq;
|
|
91 use Bio::RangeI;
|
|
92 use Bio::Location::Simple;
|
|
93 use Bio::Location::Fuzzy;
|
|
94
|
|
95
|
|
96 @ISA = qw(Bio::PrimarySeq Bio::RangeI);
|
|
97
|
|
98 sub new {
|
|
99 my ($class, @args) = @_;
|
|
100 my $self = $class->SUPER::new(@args);
|
|
101
|
|
102 my ($start,$end,$strand) =
|
|
103 $self->_rearrange( [qw(START END STRAND)],
|
|
104 @args);
|
|
105
|
|
106 defined $start && $self->start($start);
|
|
107 defined $end && $self->end($end);
|
|
108 defined $strand && $self->strand($strand);
|
|
109
|
|
110 return $self; # success - we hope!
|
|
111 }
|
|
112
|
|
113 =head2 start
|
|
114
|
|
115 Title : start
|
|
116 Usage : $obj->start($newval)
|
|
117 Function:
|
|
118 Returns : value of start
|
|
119 Args : newvalue (optional)
|
|
120
|
|
121 =cut
|
|
122
|
|
123 sub start{
|
|
124 my $self = shift;
|
|
125 if( @_ ) {
|
|
126 my $value = shift;
|
|
127 $self->{'start'} = $value;
|
|
128 }
|
|
129 return $self->{'start'};
|
|
130
|
|
131 }
|
|
132
|
|
133 =head2 end
|
|
134
|
|
135 Title : end
|
|
136 Usage : $obj->end($newval)
|
|
137 Function:
|
|
138 Returns : value of end
|
|
139 Args : newvalue (optional)
|
|
140
|
|
141 =cut
|
|
142
|
|
143 sub end {
|
|
144 my $self = shift;
|
|
145 if( @_ ) {
|
|
146 my $value = shift;
|
|
147 my $string = $self->seq;
|
|
148 if ($string and $self->start) {
|
|
149 my $s2 = $string;
|
|
150 $string =~ s/[.-]+//g;
|
|
151 my $len = CORE::length $string;
|
|
152 my $new_end = $self->start + $len - 1 ;
|
|
153 my $id = $self->id;
|
|
154 $self->warn("In sequence $id residue count gives value $len.
|
|
155 Overriding value [$value] with value $new_end for Bio::LocatableSeq::end().")
|
|
156 and $value = $new_end if $new_end != $value and $self->verbose > 0;
|
|
157 }
|
|
158
|
|
159 $self->{'end'} = $value;
|
|
160 }
|
|
161 return $self->{'end'};
|
|
162
|
|
163 }
|
|
164
|
|
165 =head2 strand
|
|
166
|
|
167 Title : strand
|
|
168 Usage : $obj->strand($newval)
|
|
169 Function:
|
|
170 Returns : value of strand
|
|
171 Args : newvalue (optional)
|
|
172
|
|
173 =cut
|
|
174
|
|
175 sub strand{
|
|
176 my $self = shift;
|
|
177 if( @_ ) {
|
|
178 my $value = shift;
|
|
179 $self->{'strand'} = $value;
|
|
180 }
|
|
181 return $self->{'strand'};
|
|
182 }
|
|
183
|
|
184 =head2 get_nse
|
|
185
|
|
186 Title : get_nse
|
|
187 Usage :
|
|
188 Function: read-only name of form id/start-end
|
|
189 Example :
|
|
190 Returns :
|
|
191 Args :
|
|
192
|
|
193 =cut
|
|
194
|
|
195 sub get_nse{
|
|
196 my ($self,$char1,$char2) = @_;
|
|
197
|
|
198 $char1 ||= "/";
|
|
199 $char2 ||= "-";
|
|
200
|
|
201 $self->throw("Attribute id not set") unless $self->id();
|
|
202 $self->throw("Attribute start not set") unless $self->start();
|
|
203 $self->throw("Attribute end not set") unless $self->end();
|
|
204
|
|
205 return $self->id() . $char1 . $self->start . $char2 . $self->end ;
|
|
206
|
|
207 }
|
|
208
|
|
209
|
|
210 =head2 no_gap
|
|
211
|
|
212 Title : no_gaps
|
|
213 Usage :$self->no_gaps('.')
|
|
214 Function:
|
|
215
|
|
216 Gets number of gaps in the sequence. The count excludes
|
|
217 leading or trailing gap characters.
|
|
218
|
|
219 Valid bioperl sequence characters are [A-Za-z\-\.\*]. Of
|
|
220 these, '.' and '-' are counted as gap characters unless an
|
|
221 optional argument specifies one of them.
|
|
222
|
|
223 Returns : number of internal gaps in the sequnce.
|
|
224 Args : a gap character (optional)
|
|
225
|
|
226 =cut
|
|
227
|
|
228 sub no_gaps {
|
|
229 my ($self,$char) = @_;
|
|
230 my ($seq, $count) = (undef, 0);
|
|
231
|
|
232 # default gap characters
|
|
233 $char ||= '-.';
|
|
234
|
|
235 $self->warn("I hope you know what you are doing setting gap to [$char]")
|
|
236 unless $char =~ /[-.]/;
|
|
237
|
|
238 $seq = $self->seq;
|
|
239 return 0 unless $seq; # empty sequence does not have gaps
|
|
240
|
|
241 $seq =~ s/^([$char]+)//;
|
|
242 $seq =~ s/([$char]+)$//;
|
|
243 $count++ while $seq =~ /[$char]+/g;
|
|
244
|
|
245 return $count;
|
|
246
|
|
247 }
|
|
248
|
|
249
|
|
250 =head2 column_from_residue_number
|
|
251
|
|
252 Title : column_from_residue_number
|
|
253 Usage : $col = $seq->column_from_residue_number($resnumber)
|
|
254 Function:
|
|
255
|
|
256 This function gives the position in the alignment
|
|
257 (i.e. column number) of the given residue number in the
|
|
258 sequence. For example, for the sequence
|
|
259
|
|
260 Seq1/91-97 AC..DEF.GH
|
|
261
|
|
262 column_from_residue_number(94) returns 5.
|
|
263
|
|
264 An exception is thrown if the residue number would lie
|
|
265 outside the length of the aligment
|
|
266 (e.g. column_from_residue_number( "Seq2", 22 )
|
|
267
|
|
268 Returns : A column number for the position of the
|
|
269 given residue in the given sequence (1 = first column)
|
|
270 Args : A residue number in the whole sequence (not just that
|
|
271 segment of it in the alignment)
|
|
272
|
|
273 =cut
|
|
274
|
|
275 sub column_from_residue_number {
|
|
276 my ($self, $resnumber) = @_;
|
|
277
|
|
278 $self->throw("Residue number has to be a positive integer, not [$resnumber]")
|
|
279 unless $resnumber =~ /^\d+$/ and $resnumber > 0;
|
|
280
|
|
281 if ($resnumber >= $self->start() and $resnumber <= $self->end()) {
|
|
282 my @residues = split //, $self->seq;
|
|
283 my $count = $self->start();
|
|
284 my $i;
|
|
285 for ($i=0; $i < @residues; $i++) {
|
|
286 if ($residues[$i] ne '.' and $residues[$i] ne '-') {
|
|
287 $count == $resnumber and last;
|
|
288 $count++;
|
|
289 }
|
|
290 }
|
|
291 # $i now holds the index of the column.
|
|
292 # The actual column number is this index + 1
|
|
293
|
|
294 return $i+1;
|
|
295 }
|
|
296
|
|
297 $self->throw("Could not find residue number $resnumber");
|
|
298
|
|
299 }
|
|
300
|
|
301
|
|
302 =head2 location_from_column
|
|
303
|
|
304 Title : location_from_column
|
|
305 Usage : $loc = $ali->location_from_column( $seq_number, $column_number)
|
|
306 Function:
|
|
307
|
|
308 This function gives the residue number in the sequence with
|
|
309 the given name for a given position in the alignment
|
|
310 (i.e. column number) of the given. Gaps complicate this
|
|
311 process and force the output to be a L<Bio::Range> where
|
|
312 values can be undefined. For example, for the alignment
|
|
313
|
|
314 Seq1/91-97 AC..DEF.G.
|
|
315 Seq2/1-9 .CYHDEFKGK
|
|
316
|
|
317 location_from_column( Seq1/91-97, 3 ) position 93
|
|
318 location_from_column( Seq1/91-97, 2 ) position 92^93
|
|
319 location_from_column( Seq1/91-97, 10) position 97^98
|
|
320 location_from_column( Seq2/1-9, 1 ) position undef
|
|
321
|
|
322 An exact position returns a Bio::Location::Simple object
|
|
323 where where location_type() returns 'EXACT', if a position
|
|
324 is between bases location_type() returns 'IN-BETWEEN'.
|
|
325 Column before the first residue returns undef. Note that if
|
|
326 the position is after the last residue in the alignment,
|
|
327 that there is no guarantee that the original sequence has
|
|
328 residues after that position.
|
|
329
|
|
330 An exception is thrown if the column number is not within
|
|
331 the sequence.
|
|
332
|
|
333 Returns : Bio::Location::Simple or undef
|
|
334 Args : A column number
|
|
335 Throws : If column is not within the sequence
|
|
336
|
|
337 See L<Bio::Location::Simple> for more.
|
|
338
|
|
339 =cut
|
|
340
|
|
341 sub location_from_column {
|
|
342 my ($self, $column) = @_;
|
|
343
|
|
344 $self->throw("Column number has to be a positive integer, not [$column]")
|
|
345 unless $column =~ /^\d+$/ and $column > 0;
|
|
346 $self->throw("Column number [column] is larger than".
|
|
347 " sequence length [". $self->length. "]")
|
|
348 unless $column <= $self->length;
|
|
349
|
|
350 my ($loc);
|
|
351 my $s = $self->subseq(1,$column);
|
|
352 $s =~ s/\W//g;
|
|
353 my $pos = CORE::length $s;
|
|
354
|
|
355 my $start = $self->start || 0 ;
|
|
356 if ($self->subseq($column, $column) =~ /[a-zA-Z]/ ) {
|
|
357 $loc = new Bio::Location::Simple
|
|
358 (-start => $pos + $start - 1,
|
|
359 -end => $pos + $start - 1,
|
|
360 -strand => 1
|
|
361 );
|
|
362 }
|
|
363 elsif ($pos == 0 and $self->start == 1) {
|
|
364 } else {
|
|
365 $loc = new Bio::Location::Simple
|
|
366 (-start => $pos + $start - 1,
|
|
367 -end => $pos +1 + $start - 1,
|
|
368 -strand => 1,
|
|
369 -location_type => 'IN-BETWEEN'
|
|
370 );
|
|
371 }
|
|
372 return $loc;
|
|
373 }
|
|
374
|
|
375 =head2 revcom
|
|
376
|
|
377 Title : revcom
|
|
378 Usage : $rev = $seq->revcom()
|
|
379 Function: Produces a new Bio::LocatableSeq object which
|
|
380 has the reversed complement of the sequence. For protein
|
|
381 sequences this throws an exception of "Sequence is a
|
|
382 protein. Cannot revcom"
|
|
383
|
|
384 Returns : A new Bio::LocatableSeq object
|
|
385 Args : none
|
|
386
|
|
387 =cut
|
|
388
|
|
389 sub revcom {
|
|
390 my ($self) = @_;
|
|
391
|
|
392 my $new = $self->SUPER::revcom;
|
|
393 $new->strand($self->strand * -1);
|
|
394 $new->start($self->start) if $self->start;
|
|
395 $new->end($self->end) if $self->end;
|
|
396 return $new;
|
|
397 }
|
|
398
|
|
399
|
|
400 =head2 trunc
|
|
401
|
|
402 Title : trunc
|
|
403 Usage : $subseq = $myseq->trunc(10,100);
|
|
404 Function: Provides a truncation of a sequence,
|
|
405
|
|
406 Example :
|
|
407 Returns : a fresh Bio::PrimarySeqI implementing object
|
|
408 Args : Two integers denoting first and last columns of the
|
|
409 sequence to be included into sub-sequence.
|
|
410
|
|
411
|
|
412 =cut
|
|
413
|
|
414 sub trunc {
|
|
415
|
|
416 my ($self, $start, $end) = @_;
|
|
417 my $new = $self->SUPER::trunc($start, $end);
|
|
418
|
|
419 $start = $self->location_from_column($start);
|
|
420 $start ? ($start = $start->start) : ($start = 1);
|
|
421
|
|
422 $end = $self->location_from_column($end);
|
|
423 $end = $end->start if $end;
|
|
424
|
|
425 $new->strand($self->strand);
|
|
426 $new->start($start) if $start;
|
|
427 $new->end($end) if $end;
|
|
428
|
|
429 return $new;
|
|
430 }
|
|
431
|
|
432 1;
|