comparison variant_effect_predictor/Bio/EnsEMBL/Utils/PolyA.pm @ 0:21066c0abaf5 draft

Uploaded
author willmclaren
date Fri, 03 Aug 2012 10:04:48 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:21066c0abaf5
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21 =head1 NAME
22
23 Bio::EnsEMBL::Utils::PolyA
24
25 =head1 SYNOPSIS
26
27 my $seq; # a Bio::Seq object
28 my $polyA = Bio::EnsEMBL::Utils::PolyA->new();
29
30 # returns a new Bio::Seq object with the trimmed sequence
31 my $trimmed_seq = $polyA->clip($seq);
32
33 # cat put Ns in the place of the polyA/polyT tail
34 my $masked_seq = $polyA->mask($seq);
35
36 # can put in lower case the polyA/polyT using any flag:
37 my $softmasked_seq = $poly->mask( $seq, 'soft' );
38
39 =head1 DESCRIPTION
40
41 It reads a Bio::Seq object, it first finds out whether it has a
42 polyA or a polyT and then performs one operation in the seq string:
43 clipping, masking or softmasking. It then returns a new Bio::Seq
44 object with the new sequence.
45
46 =head1 METHODS
47
48 =cut
49
50 package Bio::EnsEMBL::Utils::PolyA;
51
52 use Bio::EnsEMBL::Root;
53 use Bio::Seq;
54 use vars qw(@ISA);
55
56 use strict;
57
58 @ISA = qw(Bio::EnsEMBL::Root);
59
60
61 =head2 new
62
63 =cut
64
65 sub new{
66 my ($class) = @_;
67 if (ref($class)){
68 $class = ref($class);
69 }
70 my $self = {};
71 bless($self,$class);
72
73 return $self;
74 }
75
76
77 ############################################################
78
79 sub clip{
80 my ($self, $bioseq) = @_;
81
82 # print STDERR "past a $bioseq\n";
83 my $seq = $bioseq->seq;
84 $self->_clip(1);
85 $self->_mask(0);
86 $self->_softmask(0);
87 my $new_seq = $self->_find_polyA($seq);
88 my $cdna = Bio::Seq->new();
89 if (length($new_seq) > 0){
90 $cdna->seq($new_seq);
91 }
92 else {
93 print "While clipping the the polyA tail, sequence ".$bioseq->display_id." totally disappeared.\n";
94 print "Returning undef\n";
95 return undef;
96 }
97 $cdna->display_id( $bioseq->display_id );
98 $cdna->desc( $bioseq->desc );
99
100 return $cdna;
101 }
102
103 ############################################################
104
105 sub mask{
106 my ($self, $bioseq, $flag ) = @_;
107
108 my $seq = $bioseq->seq;
109 $self->_clip(0);
110 if ( $flag ){
111 $self->_mask(0);
112 $self->_softmask(1);
113 }
114 else{
115 $self->_mask(1);
116 $self->_softmask(0);
117 }
118 my $new_seq = $self->_find_polyA($seq);
119 my $cdna = new Bio::Seq;
120 $cdna->seq($new_seq);
121 $cdna->display_id( $bioseq->display_id );
122 $cdna->desc( $bioseq->desc );
123
124 return $cdna;
125 }
126
127 ############################################################
128
129
130
131
132 ############################################################
133
134 sub _find_polyA{
135 my ($self, $seq) = @_;
136 my $new_seq;
137 my $length = length($seq);
138
139 # is it a polyA or polyT?
140 my $check_polyT = substr( $seq, 0, 6 );
141
142 my $check_polyA = substr( $seq, -6 );
143
144 my $t_count = $check_polyT =~ tr/Tt//;
145 my $a_count = $check_polyA =~ tr/Aa//;
146
147 #### polyA ####
148 if ( $a_count >= 5 && $a_count > $t_count ){
149
150 # we calculate the number of bases we want to chop
151 my $length_to_mask = 0;
152
153 # we start with 3 bases
154 my ($piece, $count ) = (3,0);
155
156 # count also the number of Ns, consider the Ns as potential As
157 my $n_count = 0;
158
159 # take 3 by 3 bases from the end
160 while( $length_to_mask < $length ){
161 my $chunk = substr( $seq, ($length - ($length_to_mask + 3)), $piece);
162 $count = $chunk =~ tr/Aa//;
163 $n_count = $chunk =~ tr/Nn//;
164 if ( ($count + $n_count) >= 2*( $piece )/3 ){
165 $length_to_mask += 3;
166 }
167 else{
168 last;
169 }
170 }
171
172 if ( $length_to_mask > 0 ){
173 # do not mask the last base if it is not an A:
174 my $last_base = substr( $seq, ( $length - $length_to_mask ), 1);
175 my $previous_to_last = substr( $seq, ( $length - $length_to_mask - 1), 1);
176 if ( !( $last_base eq 'A' || $last_base eq 'a') ){
177 $length_to_mask--;
178 }
179 elsif( $previous_to_last eq 'A' || $previous_to_last eq 'a' ){
180 $length_to_mask++;
181 }
182 my $clipped_seq = substr( $seq, 0, $length - $length_to_mask );
183 my $mask;
184 if ( $self->_clip ){
185 $mask = "";
186 }
187 elsif( $self->_mask ){
188 $mask = "N" x ($length_to_mask);
189 }
190 elsif ( $self->_softmask ){
191 $mask = lc substr( $seq, ( $length - $length_to_mask ) );
192 }
193 $new_seq = $clipped_seq . $mask;
194 }
195 else{
196 $new_seq = $seq;
197 }
198 }
199 #### polyT ####
200 elsif( $t_count >=5 && $t_count > $a_count ){
201
202 # calculate the number of bases to chop
203 my $length_to_mask = -3;
204
205 # we start with 3 bases:
206 my ($piece, $count) = (3,3);
207
208 # count also the number of Ns, consider the Ns as potential As
209 my $n_count = 0;
210
211 # take 3 by 3 bases from the beginning
212 while ( $length_to_mask < $length ){
213 my $chunk = substr( $seq, $length_to_mask + 3, $piece );
214 #print STDERR "length to mask: $length_to_mask\n";
215 #print "chunk: $chunk\n";
216 $count = $chunk =~ tr/Tt//;
217 $n_count = $chunk =~ tr/Nn//;
218 if ( ($count+$n_count) >= 2*( $piece )/3 ){
219 $length_to_mask +=3;
220 }
221 else{
222 last;
223
224 }
225 }
226 if ( $length_to_mask >= 0 ){
227 # do not chop the last base if it is not a A:
228 #print STDERR "clipping sequence $seq\n";
229 my $last_base = substr( $seq, ( $length_to_mask + 3 - 1 ), 1 );
230 my $previous_to_last = substr( $seq, ( $length_to_mask + 3 ), 1 );
231 if ( !( $last_base eq 'T' || $last_base eq 't' ) ){
232 $length_to_mask--;
233 }
234 elsif( $previous_to_last eq 'T' || $previous_to_last eq 't' ){
235 $length_to_mask++;
236 }
237 my $clipped_seq = substr( $seq, $length_to_mask + 3);
238 my $mask;
239 if ( $self->_clip ){
240 $mask = "";
241 }
242 elsif( $self->_mask ){
243 $mask = "N" x ($length_to_mask+3);
244 }
245 elsif ($self->_softmask){
246 $mask = lc substr( $seq, 0, ($length_to_mask + 3) );
247 }
248 $new_seq = $mask.$clipped_seq;
249 }
250 else{
251 $new_seq = $seq;
252 }
253 }
254 else{
255 # we cannot be sure of what it is
256 # do not clip
257 $new_seq = $seq;
258 }
259
260 return $new_seq;
261 }
262
263 ############################################################
264
265 sub _mask{
266 my ($self,$mask) = @_;
267 if (defined($mask)){
268 $self->{_mask} = $mask;
269 }
270 return $self->{_mask};
271 }
272
273 ############################################################
274
275 sub _clip{
276 my ($self,$clip) = @_;
277 if (defined($clip)){
278 $self->{_clip} = $clip;
279 }
280 return $self->{_clip};
281 }
282
283 ############################################################
284
285 sub _softmask{
286 my ($self,$softmask) = @_;
287 if (defined($softmask)){
288 $self->{_softmask} = $softmask;
289 }
290 return $self->{_softmask};
291 }
292
293 ############################################################
294
295
296 sub has_polyA_track{
297 my ($self, $seq) = @_;
298 my $new_seq;
299 my $length = length($seq);
300
301 # is it a polyA or polyT?
302 my $check_polyT = substr( $seq, 0, 10 );
303
304 my $check_polyA = substr( $seq, -10 );
305
306 print STDERR "polyA: $check_polyA\n";
307
308 my $t_count = $check_polyT =~ tr/Tt//;
309 my $a_count = $check_polyA =~ tr/Aa//;
310
311 ## testing with this short cut
312 if ( $a_count >=7 || $t_count >=7 ){
313 return 1;
314 }
315 else{
316 return 0;
317 }
318 }
319
320
321 ################
322 1;