annotate variant_effect_predictor/Bio/EnsEMBL/Utils/PolyA.pm @ 1:d6778b5d8382 draft default tip

Deleted selected files
author willmclaren
date Fri, 03 Aug 2012 10:05:43 -0400
parents 21066c0abaf5
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1 =head1 LICENSE
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
2
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
4 Genome Research Limited. All rights reserved.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
5
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
6 This software is distributed under a modified Apache license.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
7 For license details, please see
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
8
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
9 http://www.ensembl.org/info/about/code_licence.html
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
10
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
11 =head1 CONTACT
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
12
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
13 Please email comments or questions to the public Ensembl
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
14 developers list at <dev@ensembl.org>.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
15
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
16 Questions may also be sent to the Ensembl help desk at
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
17 <helpdesk@ensembl.org>.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
18
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
19 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
20
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
21 =head1 NAME
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
22
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
23 Bio::EnsEMBL::Utils::PolyA
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
24
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
25 =head1 SYNOPSIS
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
26
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
27 my $seq; # a Bio::Seq object
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
28 my $polyA = Bio::EnsEMBL::Utils::PolyA->new();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
29
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
30 # returns a new Bio::Seq object with the trimmed sequence
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
31 my $trimmed_seq = $polyA->clip($seq);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
32
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
33 # cat put Ns in the place of the polyA/polyT tail
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
34 my $masked_seq = $polyA->mask($seq);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
35
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
36 # can put in lower case the polyA/polyT using any flag:
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
37 my $softmasked_seq = $poly->mask( $seq, 'soft' );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
38
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
39 =head1 DESCRIPTION
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
40
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
41 It reads a Bio::Seq object, it first finds out whether it has a
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
42 polyA or a polyT and then performs one operation in the seq string:
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
43 clipping, masking or softmasking. It then returns a new Bio::Seq
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
44 object with the new sequence.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
45
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
46 =head1 METHODS
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
47
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
48 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
49
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
50 package Bio::EnsEMBL::Utils::PolyA;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
51
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
52 use Bio::EnsEMBL::Root;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
53 use Bio::Seq;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
54 use vars qw(@ISA);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
55
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
56 use strict;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
57
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
58 @ISA = qw(Bio::EnsEMBL::Root);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
59
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
60
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
61 =head2 new
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
62
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
63 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
64
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
65 sub new{
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
66 my ($class) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
67 if (ref($class)){
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
68 $class = ref($class);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
69 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
70 my $self = {};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
71 bless($self,$class);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
72
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
73 return $self;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
74 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
75
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
76
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
77 ############################################################
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
78
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
79 sub clip{
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
80 my ($self, $bioseq) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
81
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
82 # print STDERR "past a $bioseq\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
83 my $seq = $bioseq->seq;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
84 $self->_clip(1);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
85 $self->_mask(0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
86 $self->_softmask(0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
87 my $new_seq = $self->_find_polyA($seq);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
88 my $cdna = Bio::Seq->new();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
89 if (length($new_seq) > 0){
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
90 $cdna->seq($new_seq);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
91 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
92 else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
93 print "While clipping the the polyA tail, sequence ".$bioseq->display_id." totally disappeared.\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
94 print "Returning undef\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
95 return undef;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
96 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
97 $cdna->display_id( $bioseq->display_id );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
98 $cdna->desc( $bioseq->desc );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
99
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
100 return $cdna;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
101 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
102
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
103 ############################################################
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
104
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
105 sub mask{
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
106 my ($self, $bioseq, $flag ) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
107
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
108 my $seq = $bioseq->seq;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
109 $self->_clip(0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
110 if ( $flag ){
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
111 $self->_mask(0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
112 $self->_softmask(1);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
113 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
114 else{
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
115 $self->_mask(1);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
116 $self->_softmask(0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
117 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
118 my $new_seq = $self->_find_polyA($seq);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
119 my $cdna = new Bio::Seq;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
120 $cdna->seq($new_seq);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
121 $cdna->display_id( $bioseq->display_id );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
122 $cdna->desc( $bioseq->desc );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
123
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
124 return $cdna;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
125 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
126
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
127 ############################################################
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
128
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
129
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
130
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
131
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
132 ############################################################
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
133
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
134 sub _find_polyA{
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
135 my ($self, $seq) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
136 my $new_seq;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
137 my $length = length($seq);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
138
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
139 # is it a polyA or polyT?
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
140 my $check_polyT = substr( $seq, 0, 6 );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
141
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
142 my $check_polyA = substr( $seq, -6 );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
143
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
144 my $t_count = $check_polyT =~ tr/Tt//;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
145 my $a_count = $check_polyA =~ tr/Aa//;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
146
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
147 #### polyA ####
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
148 if ( $a_count >= 5 && $a_count > $t_count ){
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
149
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
150 # we calculate the number of bases we want to chop
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
151 my $length_to_mask = 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
152
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
153 # we start with 3 bases
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
154 my ($piece, $count ) = (3,0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
155
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
156 # count also the number of Ns, consider the Ns as potential As
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
157 my $n_count = 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
158
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
159 # take 3 by 3 bases from the end
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
160 while( $length_to_mask < $length ){
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
161 my $chunk = substr( $seq, ($length - ($length_to_mask + 3)), $piece);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
162 $count = $chunk =~ tr/Aa//;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
163 $n_count = $chunk =~ tr/Nn//;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
164 if ( ($count + $n_count) >= 2*( $piece )/3 ){
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
165 $length_to_mask += 3;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
166 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
167 else{
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
168 last;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
169 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
170 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
171
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
172 if ( $length_to_mask > 0 ){
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
173 # do not mask the last base if it is not an A:
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
174 my $last_base = substr( $seq, ( $length - $length_to_mask ), 1);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
175 my $previous_to_last = substr( $seq, ( $length - $length_to_mask - 1), 1);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
176 if ( !( $last_base eq 'A' || $last_base eq 'a') ){
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
177 $length_to_mask--;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
178 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
179 elsif( $previous_to_last eq 'A' || $previous_to_last eq 'a' ){
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
180 $length_to_mask++;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
181 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
182 my $clipped_seq = substr( $seq, 0, $length - $length_to_mask );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
183 my $mask;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
184 if ( $self->_clip ){
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
185 $mask = "";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
186 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
187 elsif( $self->_mask ){
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
188 $mask = "N" x ($length_to_mask);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
189 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
190 elsif ( $self->_softmask ){
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
191 $mask = lc substr( $seq, ( $length - $length_to_mask ) );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
192 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
193 $new_seq = $clipped_seq . $mask;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
194 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
195 else{
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
196 $new_seq = $seq;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
197 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
198 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
199 #### polyT ####
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
200 elsif( $t_count >=5 && $t_count > $a_count ){
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
201
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
202 # calculate the number of bases to chop
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
203 my $length_to_mask = -3;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
204
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
205 # we start with 3 bases:
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
206 my ($piece, $count) = (3,3);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
207
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
208 # count also the number of Ns, consider the Ns as potential As
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
209 my $n_count = 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
210
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
211 # take 3 by 3 bases from the beginning
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
212 while ( $length_to_mask < $length ){
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
213 my $chunk = substr( $seq, $length_to_mask + 3, $piece );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
214 #print STDERR "length to mask: $length_to_mask\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
215 #print "chunk: $chunk\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
216 $count = $chunk =~ tr/Tt//;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
217 $n_count = $chunk =~ tr/Nn//;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
218 if ( ($count+$n_count) >= 2*( $piece )/3 ){
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
219 $length_to_mask +=3;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
220 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
221 else{
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
222 last;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
223
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
224 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
225 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
226 if ( $length_to_mask >= 0 ){
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
227 # do not chop the last base if it is not a A:
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
228 #print STDERR "clipping sequence $seq\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
229 my $last_base = substr( $seq, ( $length_to_mask + 3 - 1 ), 1 );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
230 my $previous_to_last = substr( $seq, ( $length_to_mask + 3 ), 1 );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
231 if ( !( $last_base eq 'T' || $last_base eq 't' ) ){
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
232 $length_to_mask--;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
233 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
234 elsif( $previous_to_last eq 'T' || $previous_to_last eq 't' ){
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
235 $length_to_mask++;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
236 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
237 my $clipped_seq = substr( $seq, $length_to_mask + 3);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
238 my $mask;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
239 if ( $self->_clip ){
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
240 $mask = "";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
241 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
242 elsif( $self->_mask ){
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
243 $mask = "N" x ($length_to_mask+3);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
244 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
245 elsif ($self->_softmask){
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
246 $mask = lc substr( $seq, 0, ($length_to_mask + 3) );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
247 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
248 $new_seq = $mask.$clipped_seq;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
249 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
250 else{
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
251 $new_seq = $seq;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
252 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
253 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
254 else{
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
255 # we cannot be sure of what it is
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
256 # do not clip
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
257 $new_seq = $seq;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
258 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
259
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
260 return $new_seq;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
261 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
262
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
263 ############################################################
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
264
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
265 sub _mask{
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
266 my ($self,$mask) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
267 if (defined($mask)){
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
268 $self->{_mask} = $mask;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
269 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
270 return $self->{_mask};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
271 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
272
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
273 ############################################################
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
274
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
275 sub _clip{
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
276 my ($self,$clip) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
277 if (defined($clip)){
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
278 $self->{_clip} = $clip;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
279 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
280 return $self->{_clip};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
281 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
282
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
283 ############################################################
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
284
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
285 sub _softmask{
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
286 my ($self,$softmask) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
287 if (defined($softmask)){
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
288 $self->{_softmask} = $softmask;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
289 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
290 return $self->{_softmask};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
291 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
292
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
293 ############################################################
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
294
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
295
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
296 sub has_polyA_track{
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
297 my ($self, $seq) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
298 my $new_seq;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
299 my $length = length($seq);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
300
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
301 # is it a polyA or polyT?
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
302 my $check_polyT = substr( $seq, 0, 10 );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
303
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
304 my $check_polyA = substr( $seq, -10 );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
305
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
306 print STDERR "polyA: $check_polyA\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
307
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
308 my $t_count = $check_polyT =~ tr/Tt//;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
309 my $a_count = $check_polyA =~ tr/Aa//;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
310
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
311 ## testing with this short cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
312 if ( $a_count >=7 || $t_count >=7 ){
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
313 return 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
314 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
315 else{
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
316 return 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
317 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
318 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
319
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
320
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
321 ################
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
322 1;