0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =cut
|
|
20
|
|
21 =head1 NAME
|
|
22
|
|
23 Bio::EnsEMBL::Utils::PolyA
|
|
24
|
|
25 =head1 SYNOPSIS
|
|
26
|
|
27 my $seq; # a Bio::Seq object
|
|
28 my $polyA = Bio::EnsEMBL::Utils::PolyA->new();
|
|
29
|
|
30 # returns a new Bio::Seq object with the trimmed sequence
|
|
31 my $trimmed_seq = $polyA->clip($seq);
|
|
32
|
|
33 # cat put Ns in the place of the polyA/polyT tail
|
|
34 my $masked_seq = $polyA->mask($seq);
|
|
35
|
|
36 # can put in lower case the polyA/polyT using any flag:
|
|
37 my $softmasked_seq = $poly->mask( $seq, 'soft' );
|
|
38
|
|
39 =head1 DESCRIPTION
|
|
40
|
|
41 It reads a Bio::Seq object, it first finds out whether it has a
|
|
42 polyA or a polyT and then performs one operation in the seq string:
|
|
43 clipping, masking or softmasking. It then returns a new Bio::Seq
|
|
44 object with the new sequence.
|
|
45
|
|
46 =head1 METHODS
|
|
47
|
|
48 =cut
|
|
49
|
|
50 package Bio::EnsEMBL::Utils::PolyA;
|
|
51
|
|
52 use Bio::EnsEMBL::Root;
|
|
53 use Bio::Seq;
|
|
54 use vars qw(@ISA);
|
|
55
|
|
56 use strict;
|
|
57
|
|
58 @ISA = qw(Bio::EnsEMBL::Root);
|
|
59
|
|
60
|
|
61 =head2 new
|
|
62
|
|
63 =cut
|
|
64
|
|
65 sub new{
|
|
66 my ($class) = @_;
|
|
67 if (ref($class)){
|
|
68 $class = ref($class);
|
|
69 }
|
|
70 my $self = {};
|
|
71 bless($self,$class);
|
|
72
|
|
73 return $self;
|
|
74 }
|
|
75
|
|
76
|
|
77 ############################################################
|
|
78
|
|
79 sub clip{
|
|
80 my ($self, $bioseq) = @_;
|
|
81
|
|
82 # print STDERR "past a $bioseq\n";
|
|
83 my $seq = $bioseq->seq;
|
|
84 $self->_clip(1);
|
|
85 $self->_mask(0);
|
|
86 $self->_softmask(0);
|
|
87 my $new_seq = $self->_find_polyA($seq);
|
|
88 my $cdna = Bio::Seq->new();
|
|
89 if (length($new_seq) > 0){
|
|
90 $cdna->seq($new_seq);
|
|
91 }
|
|
92 else {
|
|
93 print "While clipping the the polyA tail, sequence ".$bioseq->display_id." totally disappeared.\n";
|
|
94 print "Returning undef\n";
|
|
95 return undef;
|
|
96 }
|
|
97 $cdna->display_id( $bioseq->display_id );
|
|
98 $cdna->desc( $bioseq->desc );
|
|
99
|
|
100 return $cdna;
|
|
101 }
|
|
102
|
|
103 ############################################################
|
|
104
|
|
105 sub mask{
|
|
106 my ($self, $bioseq, $flag ) = @_;
|
|
107
|
|
108 my $seq = $bioseq->seq;
|
|
109 $self->_clip(0);
|
|
110 if ( $flag ){
|
|
111 $self->_mask(0);
|
|
112 $self->_softmask(1);
|
|
113 }
|
|
114 else{
|
|
115 $self->_mask(1);
|
|
116 $self->_softmask(0);
|
|
117 }
|
|
118 my $new_seq = $self->_find_polyA($seq);
|
|
119 my $cdna = new Bio::Seq;
|
|
120 $cdna->seq($new_seq);
|
|
121 $cdna->display_id( $bioseq->display_id );
|
|
122 $cdna->desc( $bioseq->desc );
|
|
123
|
|
124 return $cdna;
|
|
125 }
|
|
126
|
|
127 ############################################################
|
|
128
|
|
129
|
|
130
|
|
131
|
|
132 ############################################################
|
|
133
|
|
134 sub _find_polyA{
|
|
135 my ($self, $seq) = @_;
|
|
136 my $new_seq;
|
|
137 my $length = length($seq);
|
|
138
|
|
139 # is it a polyA or polyT?
|
|
140 my $check_polyT = substr( $seq, 0, 6 );
|
|
141
|
|
142 my $check_polyA = substr( $seq, -6 );
|
|
143
|
|
144 my $t_count = $check_polyT =~ tr/Tt//;
|
|
145 my $a_count = $check_polyA =~ tr/Aa//;
|
|
146
|
|
147 #### polyA ####
|
|
148 if ( $a_count >= 5 && $a_count > $t_count ){
|
|
149
|
|
150 # we calculate the number of bases we want to chop
|
|
151 my $length_to_mask = 0;
|
|
152
|
|
153 # we start with 3 bases
|
|
154 my ($piece, $count ) = (3,0);
|
|
155
|
|
156 # count also the number of Ns, consider the Ns as potential As
|
|
157 my $n_count = 0;
|
|
158
|
|
159 # take 3 by 3 bases from the end
|
|
160 while( $length_to_mask < $length ){
|
|
161 my $chunk = substr( $seq, ($length - ($length_to_mask + 3)), $piece);
|
|
162 $count = $chunk =~ tr/Aa//;
|
|
163 $n_count = $chunk =~ tr/Nn//;
|
|
164 if ( ($count + $n_count) >= 2*( $piece )/3 ){
|
|
165 $length_to_mask += 3;
|
|
166 }
|
|
167 else{
|
|
168 last;
|
|
169 }
|
|
170 }
|
|
171
|
|
172 if ( $length_to_mask > 0 ){
|
|
173 # do not mask the last base if it is not an A:
|
|
174 my $last_base = substr( $seq, ( $length - $length_to_mask ), 1);
|
|
175 my $previous_to_last = substr( $seq, ( $length - $length_to_mask - 1), 1);
|
|
176 if ( !( $last_base eq 'A' || $last_base eq 'a') ){
|
|
177 $length_to_mask--;
|
|
178 }
|
|
179 elsif( $previous_to_last eq 'A' || $previous_to_last eq 'a' ){
|
|
180 $length_to_mask++;
|
|
181 }
|
|
182 my $clipped_seq = substr( $seq, 0, $length - $length_to_mask );
|
|
183 my $mask;
|
|
184 if ( $self->_clip ){
|
|
185 $mask = "";
|
|
186 }
|
|
187 elsif( $self->_mask ){
|
|
188 $mask = "N" x ($length_to_mask);
|
|
189 }
|
|
190 elsif ( $self->_softmask ){
|
|
191 $mask = lc substr( $seq, ( $length - $length_to_mask ) );
|
|
192 }
|
|
193 $new_seq = $clipped_seq . $mask;
|
|
194 }
|
|
195 else{
|
|
196 $new_seq = $seq;
|
|
197 }
|
|
198 }
|
|
199 #### polyT ####
|
|
200 elsif( $t_count >=5 && $t_count > $a_count ){
|
|
201
|
|
202 # calculate the number of bases to chop
|
|
203 my $length_to_mask = -3;
|
|
204
|
|
205 # we start with 3 bases:
|
|
206 my ($piece, $count) = (3,3);
|
|
207
|
|
208 # count also the number of Ns, consider the Ns as potential As
|
|
209 my $n_count = 0;
|
|
210
|
|
211 # take 3 by 3 bases from the beginning
|
|
212 while ( $length_to_mask < $length ){
|
|
213 my $chunk = substr( $seq, $length_to_mask + 3, $piece );
|
|
214 #print STDERR "length to mask: $length_to_mask\n";
|
|
215 #print "chunk: $chunk\n";
|
|
216 $count = $chunk =~ tr/Tt//;
|
|
217 $n_count = $chunk =~ tr/Nn//;
|
|
218 if ( ($count+$n_count) >= 2*( $piece )/3 ){
|
|
219 $length_to_mask +=3;
|
|
220 }
|
|
221 else{
|
|
222 last;
|
|
223
|
|
224 }
|
|
225 }
|
|
226 if ( $length_to_mask >= 0 ){
|
|
227 # do not chop the last base if it is not a A:
|
|
228 #print STDERR "clipping sequence $seq\n";
|
|
229 my $last_base = substr( $seq, ( $length_to_mask + 3 - 1 ), 1 );
|
|
230 my $previous_to_last = substr( $seq, ( $length_to_mask + 3 ), 1 );
|
|
231 if ( !( $last_base eq 'T' || $last_base eq 't' ) ){
|
|
232 $length_to_mask--;
|
|
233 }
|
|
234 elsif( $previous_to_last eq 'T' || $previous_to_last eq 't' ){
|
|
235 $length_to_mask++;
|
|
236 }
|
|
237 my $clipped_seq = substr( $seq, $length_to_mask + 3);
|
|
238 my $mask;
|
|
239 if ( $self->_clip ){
|
|
240 $mask = "";
|
|
241 }
|
|
242 elsif( $self->_mask ){
|
|
243 $mask = "N" x ($length_to_mask+3);
|
|
244 }
|
|
245 elsif ($self->_softmask){
|
|
246 $mask = lc substr( $seq, 0, ($length_to_mask + 3) );
|
|
247 }
|
|
248 $new_seq = $mask.$clipped_seq;
|
|
249 }
|
|
250 else{
|
|
251 $new_seq = $seq;
|
|
252 }
|
|
253 }
|
|
254 else{
|
|
255 # we cannot be sure of what it is
|
|
256 # do not clip
|
|
257 $new_seq = $seq;
|
|
258 }
|
|
259
|
|
260 return $new_seq;
|
|
261 }
|
|
262
|
|
263 ############################################################
|
|
264
|
|
265 sub _mask{
|
|
266 my ($self,$mask) = @_;
|
|
267 if (defined($mask)){
|
|
268 $self->{_mask} = $mask;
|
|
269 }
|
|
270 return $self->{_mask};
|
|
271 }
|
|
272
|
|
273 ############################################################
|
|
274
|
|
275 sub _clip{
|
|
276 my ($self,$clip) = @_;
|
|
277 if (defined($clip)){
|
|
278 $self->{_clip} = $clip;
|
|
279 }
|
|
280 return $self->{_clip};
|
|
281 }
|
|
282
|
|
283 ############################################################
|
|
284
|
|
285 sub _softmask{
|
|
286 my ($self,$softmask) = @_;
|
|
287 if (defined($softmask)){
|
|
288 $self->{_softmask} = $softmask;
|
|
289 }
|
|
290 return $self->{_softmask};
|
|
291 }
|
|
292
|
|
293 ############################################################
|
|
294
|
|
295
|
|
296 sub has_polyA_track{
|
|
297 my ($self, $seq) = @_;
|
|
298 my $new_seq;
|
|
299 my $length = length($seq);
|
|
300
|
|
301 # is it a polyA or polyT?
|
|
302 my $check_polyT = substr( $seq, 0, 10 );
|
|
303
|
|
304 my $check_polyA = substr( $seq, -10 );
|
|
305
|
|
306 print STDERR "polyA: $check_polyA\n";
|
|
307
|
|
308 my $t_count = $check_polyT =~ tr/Tt//;
|
|
309 my $a_count = $check_polyA =~ tr/Aa//;
|
|
310
|
|
311 ## testing with this short cut
|
|
312 if ( $a_count >=7 || $t_count >=7 ){
|
|
313 return 1;
|
|
314 }
|
|
315 else{
|
|
316 return 0;
|
|
317 }
|
|
318 }
|
|
319
|
|
320
|
|
321 ################
|
|
322 1;
|