Mercurial > repos > willmclaren > ensembl_vep
comparison variant_effect_predictor/Bio/EnsEMBL/Utils/PolyA.pm @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:21066c0abaf5 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <dev@ensembl.org>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =cut | |
20 | |
21 =head1 NAME | |
22 | |
23 Bio::EnsEMBL::Utils::PolyA | |
24 | |
25 =head1 SYNOPSIS | |
26 | |
27 my $seq; # a Bio::Seq object | |
28 my $polyA = Bio::EnsEMBL::Utils::PolyA->new(); | |
29 | |
30 # returns a new Bio::Seq object with the trimmed sequence | |
31 my $trimmed_seq = $polyA->clip($seq); | |
32 | |
33 # cat put Ns in the place of the polyA/polyT tail | |
34 my $masked_seq = $polyA->mask($seq); | |
35 | |
36 # can put in lower case the polyA/polyT using any flag: | |
37 my $softmasked_seq = $poly->mask( $seq, 'soft' ); | |
38 | |
39 =head1 DESCRIPTION | |
40 | |
41 It reads a Bio::Seq object, it first finds out whether it has a | |
42 polyA or a polyT and then performs one operation in the seq string: | |
43 clipping, masking or softmasking. It then returns a new Bio::Seq | |
44 object with the new sequence. | |
45 | |
46 =head1 METHODS | |
47 | |
48 =cut | |
49 | |
50 package Bio::EnsEMBL::Utils::PolyA; | |
51 | |
52 use Bio::EnsEMBL::Root; | |
53 use Bio::Seq; | |
54 use vars qw(@ISA); | |
55 | |
56 use strict; | |
57 | |
58 @ISA = qw(Bio::EnsEMBL::Root); | |
59 | |
60 | |
61 =head2 new | |
62 | |
63 =cut | |
64 | |
65 sub new{ | |
66 my ($class) = @_; | |
67 if (ref($class)){ | |
68 $class = ref($class); | |
69 } | |
70 my $self = {}; | |
71 bless($self,$class); | |
72 | |
73 return $self; | |
74 } | |
75 | |
76 | |
77 ############################################################ | |
78 | |
79 sub clip{ | |
80 my ($self, $bioseq) = @_; | |
81 | |
82 # print STDERR "past a $bioseq\n"; | |
83 my $seq = $bioseq->seq; | |
84 $self->_clip(1); | |
85 $self->_mask(0); | |
86 $self->_softmask(0); | |
87 my $new_seq = $self->_find_polyA($seq); | |
88 my $cdna = Bio::Seq->new(); | |
89 if (length($new_seq) > 0){ | |
90 $cdna->seq($new_seq); | |
91 } | |
92 else { | |
93 print "While clipping the the polyA tail, sequence ".$bioseq->display_id." totally disappeared.\n"; | |
94 print "Returning undef\n"; | |
95 return undef; | |
96 } | |
97 $cdna->display_id( $bioseq->display_id ); | |
98 $cdna->desc( $bioseq->desc ); | |
99 | |
100 return $cdna; | |
101 } | |
102 | |
103 ############################################################ | |
104 | |
105 sub mask{ | |
106 my ($self, $bioseq, $flag ) = @_; | |
107 | |
108 my $seq = $bioseq->seq; | |
109 $self->_clip(0); | |
110 if ( $flag ){ | |
111 $self->_mask(0); | |
112 $self->_softmask(1); | |
113 } | |
114 else{ | |
115 $self->_mask(1); | |
116 $self->_softmask(0); | |
117 } | |
118 my $new_seq = $self->_find_polyA($seq); | |
119 my $cdna = new Bio::Seq; | |
120 $cdna->seq($new_seq); | |
121 $cdna->display_id( $bioseq->display_id ); | |
122 $cdna->desc( $bioseq->desc ); | |
123 | |
124 return $cdna; | |
125 } | |
126 | |
127 ############################################################ | |
128 | |
129 | |
130 | |
131 | |
132 ############################################################ | |
133 | |
134 sub _find_polyA{ | |
135 my ($self, $seq) = @_; | |
136 my $new_seq; | |
137 my $length = length($seq); | |
138 | |
139 # is it a polyA or polyT? | |
140 my $check_polyT = substr( $seq, 0, 6 ); | |
141 | |
142 my $check_polyA = substr( $seq, -6 ); | |
143 | |
144 my $t_count = $check_polyT =~ tr/Tt//; | |
145 my $a_count = $check_polyA =~ tr/Aa//; | |
146 | |
147 #### polyA #### | |
148 if ( $a_count >= 5 && $a_count > $t_count ){ | |
149 | |
150 # we calculate the number of bases we want to chop | |
151 my $length_to_mask = 0; | |
152 | |
153 # we start with 3 bases | |
154 my ($piece, $count ) = (3,0); | |
155 | |
156 # count also the number of Ns, consider the Ns as potential As | |
157 my $n_count = 0; | |
158 | |
159 # take 3 by 3 bases from the end | |
160 while( $length_to_mask < $length ){ | |
161 my $chunk = substr( $seq, ($length - ($length_to_mask + 3)), $piece); | |
162 $count = $chunk =~ tr/Aa//; | |
163 $n_count = $chunk =~ tr/Nn//; | |
164 if ( ($count + $n_count) >= 2*( $piece )/3 ){ | |
165 $length_to_mask += 3; | |
166 } | |
167 else{ | |
168 last; | |
169 } | |
170 } | |
171 | |
172 if ( $length_to_mask > 0 ){ | |
173 # do not mask the last base if it is not an A: | |
174 my $last_base = substr( $seq, ( $length - $length_to_mask ), 1); | |
175 my $previous_to_last = substr( $seq, ( $length - $length_to_mask - 1), 1); | |
176 if ( !( $last_base eq 'A' || $last_base eq 'a') ){ | |
177 $length_to_mask--; | |
178 } | |
179 elsif( $previous_to_last eq 'A' || $previous_to_last eq 'a' ){ | |
180 $length_to_mask++; | |
181 } | |
182 my $clipped_seq = substr( $seq, 0, $length - $length_to_mask ); | |
183 my $mask; | |
184 if ( $self->_clip ){ | |
185 $mask = ""; | |
186 } | |
187 elsif( $self->_mask ){ | |
188 $mask = "N" x ($length_to_mask); | |
189 } | |
190 elsif ( $self->_softmask ){ | |
191 $mask = lc substr( $seq, ( $length - $length_to_mask ) ); | |
192 } | |
193 $new_seq = $clipped_seq . $mask; | |
194 } | |
195 else{ | |
196 $new_seq = $seq; | |
197 } | |
198 } | |
199 #### polyT #### | |
200 elsif( $t_count >=5 && $t_count > $a_count ){ | |
201 | |
202 # calculate the number of bases to chop | |
203 my $length_to_mask = -3; | |
204 | |
205 # we start with 3 bases: | |
206 my ($piece, $count) = (3,3); | |
207 | |
208 # count also the number of Ns, consider the Ns as potential As | |
209 my $n_count = 0; | |
210 | |
211 # take 3 by 3 bases from the beginning | |
212 while ( $length_to_mask < $length ){ | |
213 my $chunk = substr( $seq, $length_to_mask + 3, $piece ); | |
214 #print STDERR "length to mask: $length_to_mask\n"; | |
215 #print "chunk: $chunk\n"; | |
216 $count = $chunk =~ tr/Tt//; | |
217 $n_count = $chunk =~ tr/Nn//; | |
218 if ( ($count+$n_count) >= 2*( $piece )/3 ){ | |
219 $length_to_mask +=3; | |
220 } | |
221 else{ | |
222 last; | |
223 | |
224 } | |
225 } | |
226 if ( $length_to_mask >= 0 ){ | |
227 # do not chop the last base if it is not a A: | |
228 #print STDERR "clipping sequence $seq\n"; | |
229 my $last_base = substr( $seq, ( $length_to_mask + 3 - 1 ), 1 ); | |
230 my $previous_to_last = substr( $seq, ( $length_to_mask + 3 ), 1 ); | |
231 if ( !( $last_base eq 'T' || $last_base eq 't' ) ){ | |
232 $length_to_mask--; | |
233 } | |
234 elsif( $previous_to_last eq 'T' || $previous_to_last eq 't' ){ | |
235 $length_to_mask++; | |
236 } | |
237 my $clipped_seq = substr( $seq, $length_to_mask + 3); | |
238 my $mask; | |
239 if ( $self->_clip ){ | |
240 $mask = ""; | |
241 } | |
242 elsif( $self->_mask ){ | |
243 $mask = "N" x ($length_to_mask+3); | |
244 } | |
245 elsif ($self->_softmask){ | |
246 $mask = lc substr( $seq, 0, ($length_to_mask + 3) ); | |
247 } | |
248 $new_seq = $mask.$clipped_seq; | |
249 } | |
250 else{ | |
251 $new_seq = $seq; | |
252 } | |
253 } | |
254 else{ | |
255 # we cannot be sure of what it is | |
256 # do not clip | |
257 $new_seq = $seq; | |
258 } | |
259 | |
260 return $new_seq; | |
261 } | |
262 | |
263 ############################################################ | |
264 | |
265 sub _mask{ | |
266 my ($self,$mask) = @_; | |
267 if (defined($mask)){ | |
268 $self->{_mask} = $mask; | |
269 } | |
270 return $self->{_mask}; | |
271 } | |
272 | |
273 ############################################################ | |
274 | |
275 sub _clip{ | |
276 my ($self,$clip) = @_; | |
277 if (defined($clip)){ | |
278 $self->{_clip} = $clip; | |
279 } | |
280 return $self->{_clip}; | |
281 } | |
282 | |
283 ############################################################ | |
284 | |
285 sub _softmask{ | |
286 my ($self,$softmask) = @_; | |
287 if (defined($softmask)){ | |
288 $self->{_softmask} = $softmask; | |
289 } | |
290 return $self->{_softmask}; | |
291 } | |
292 | |
293 ############################################################ | |
294 | |
295 | |
296 sub has_polyA_track{ | |
297 my ($self, $seq) = @_; | |
298 my $new_seq; | |
299 my $length = length($seq); | |
300 | |
301 # is it a polyA or polyT? | |
302 my $check_polyT = substr( $seq, 0, 10 ); | |
303 | |
304 my $check_polyA = substr( $seq, -10 ); | |
305 | |
306 print STDERR "polyA: $check_polyA\n"; | |
307 | |
308 my $t_count = $check_polyT =~ tr/Tt//; | |
309 my $a_count = $check_polyA =~ tr/Aa//; | |
310 | |
311 ## testing with this short cut | |
312 if ( $a_count >=7 || $t_count >=7 ){ | |
313 return 1; | |
314 } | |
315 else{ | |
316 return 0; | |
317 } | |
318 } | |
319 | |
320 | |
321 ################ | |
322 1; |