annotate variant_effect_predictor/Bio/EnsEMBL/Utils/PolyA.pm @ 3:d30fa12e4cc5 default tip

Merge heads 2:a5976b2dce6f and 1:09613ce8151e which were created as a result of a recently fixed bug.
author devteam <devteam@galaxyproject.org>
date Mon, 13 Jan 2014 10:38:30 -0500
parents 1f6dce3d34e0
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
1 =head1 LICENSE
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
2
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
4 Genome Research Limited. All rights reserved.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
5
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
6 This software is distributed under a modified Apache license.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
7 For license details, please see
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
8
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
9 http://www.ensembl.org/info/about/code_licence.html
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
10
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
11 =head1 CONTACT
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
12
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
13 Please email comments or questions to the public Ensembl
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
14 developers list at <dev@ensembl.org>.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
15
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
16 Questions may also be sent to the Ensembl help desk at
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
17 <helpdesk@ensembl.org>.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
18
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
19 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
20
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
21 =head1 NAME
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
22
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
23 Bio::EnsEMBL::Utils::PolyA
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
24
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
25 =head1 SYNOPSIS
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
26
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
27 my $seq; # a Bio::Seq object
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
28 my $polyA = Bio::EnsEMBL::Utils::PolyA->new();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
29
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
30 # returns a new Bio::Seq object with the trimmed sequence
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
31 my $trimmed_seq = $polyA->clip($seq);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
32
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
33 # cat put Ns in the place of the polyA/polyT tail
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
34 my $masked_seq = $polyA->mask($seq);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
35
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
36 # can put in lower case the polyA/polyT using any flag:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
37 my $softmasked_seq = $poly->mask( $seq, 'soft' );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
38
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
39 =head1 DESCRIPTION
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
40
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
41 It reads a Bio::Seq object, it first finds out whether it has a
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
42 polyA or a polyT and then performs one operation in the seq string:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
43 clipping, masking or softmasking. It then returns a new Bio::Seq
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
44 object with the new sequence.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
45
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
46 =head1 METHODS
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
47
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
48 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
49
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
50 package Bio::EnsEMBL::Utils::PolyA;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
51
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
52 use Bio::EnsEMBL::Root;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
53 use Bio::Seq;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
54 use vars qw(@ISA);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
55
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
56 use strict;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
57
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
58 @ISA = qw(Bio::EnsEMBL::Root);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
59
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
60
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
61 =head2 new
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
62
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
63 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
64
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
65 sub new{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
66 my ($class) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
67 if (ref($class)){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
68 $class = ref($class);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
69 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
70 my $self = {};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
71 bless($self,$class);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
72
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
73 return $self;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
74 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
75
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
76
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
77 ############################################################
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
78
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
79 sub clip{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
80 my ($self, $bioseq) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
81
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
82 # print STDERR "past a $bioseq\n";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
83 my $seq = $bioseq->seq;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
84 $self->_clip(1);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
85 $self->_mask(0);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
86 $self->_softmask(0);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
87 my $new_seq = $self->_find_polyA($seq);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
88 my $cdna = Bio::Seq->new();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
89 if (length($new_seq) > 0){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
90 $cdna->seq($new_seq);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
91 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
92 else {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
93 print "While clipping the the polyA tail, sequence ".$bioseq->display_id." totally disappeared.\n";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
94 print "Returning undef\n";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
95 return undef;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
96 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
97 $cdna->display_id( $bioseq->display_id );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
98 $cdna->desc( $bioseq->desc );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
99
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
100 return $cdna;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
101 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
102
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
103 ############################################################
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
104
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
105 sub mask{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
106 my ($self, $bioseq, $flag ) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
107
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
108 my $seq = $bioseq->seq;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
109 $self->_clip(0);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
110 if ( $flag ){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
111 $self->_mask(0);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
112 $self->_softmask(1);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
113 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
114 else{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
115 $self->_mask(1);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
116 $self->_softmask(0);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
117 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
118 my $new_seq = $self->_find_polyA($seq);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
119 my $cdna = new Bio::Seq;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
120 $cdna->seq($new_seq);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
121 $cdna->display_id( $bioseq->display_id );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
122 $cdna->desc( $bioseq->desc );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
123
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
124 return $cdna;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
125 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
126
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
127 ############################################################
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
128
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
129
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
130
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
131
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
132 ############################################################
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
133
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
134 sub _find_polyA{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
135 my ($self, $seq) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
136 my $new_seq;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
137 my $length = length($seq);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
138
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
139 # is it a polyA or polyT?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
140 my $check_polyT = substr( $seq, 0, 6 );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
141
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
142 my $check_polyA = substr( $seq, -6 );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
143
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
144 my $t_count = $check_polyT =~ tr/Tt//;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
145 my $a_count = $check_polyA =~ tr/Aa//;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
146
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
147 #### polyA ####
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
148 if ( $a_count >= 5 && $a_count > $t_count ){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
149
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
150 # we calculate the number of bases we want to chop
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
151 my $length_to_mask = 0;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
152
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
153 # we start with 3 bases
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
154 my ($piece, $count ) = (3,0);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
155
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
156 # count also the number of Ns, consider the Ns as potential As
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
157 my $n_count = 0;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
158
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
159 # take 3 by 3 bases from the end
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
160 while( $length_to_mask < $length ){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
161 my $chunk = substr( $seq, ($length - ($length_to_mask + 3)), $piece);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
162 $count = $chunk =~ tr/Aa//;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
163 $n_count = $chunk =~ tr/Nn//;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
164 if ( ($count + $n_count) >= 2*( $piece )/3 ){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
165 $length_to_mask += 3;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
166 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
167 else{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
168 last;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
169 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
170 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
171
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
172 if ( $length_to_mask > 0 ){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
173 # do not mask the last base if it is not an A:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
174 my $last_base = substr( $seq, ( $length - $length_to_mask ), 1);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
175 my $previous_to_last = substr( $seq, ( $length - $length_to_mask - 1), 1);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
176 if ( !( $last_base eq 'A' || $last_base eq 'a') ){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
177 $length_to_mask--;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
178 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
179 elsif( $previous_to_last eq 'A' || $previous_to_last eq 'a' ){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
180 $length_to_mask++;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
181 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
182 my $clipped_seq = substr( $seq, 0, $length - $length_to_mask );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
183 my $mask;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
184 if ( $self->_clip ){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
185 $mask = "";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
186 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
187 elsif( $self->_mask ){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
188 $mask = "N" x ($length_to_mask);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
189 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
190 elsif ( $self->_softmask ){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
191 $mask = lc substr( $seq, ( $length - $length_to_mask ) );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
192 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
193 $new_seq = $clipped_seq . $mask;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
194 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
195 else{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
196 $new_seq = $seq;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
197 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
198 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
199 #### polyT ####
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
200 elsif( $t_count >=5 && $t_count > $a_count ){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
201
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
202 # calculate the number of bases to chop
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
203 my $length_to_mask = -3;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
204
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
205 # we start with 3 bases:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
206 my ($piece, $count) = (3,3);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
207
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
208 # count also the number of Ns, consider the Ns as potential As
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
209 my $n_count = 0;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
210
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
211 # take 3 by 3 bases from the beginning
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
212 while ( $length_to_mask < $length ){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
213 my $chunk = substr( $seq, $length_to_mask + 3, $piece );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
214 #print STDERR "length to mask: $length_to_mask\n";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
215 #print "chunk: $chunk\n";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
216 $count = $chunk =~ tr/Tt//;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
217 $n_count = $chunk =~ tr/Nn//;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
218 if ( ($count+$n_count) >= 2*( $piece )/3 ){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
219 $length_to_mask +=3;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
220 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
221 else{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
222 last;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
223
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
224 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
225 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
226 if ( $length_to_mask >= 0 ){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
227 # do not chop the last base if it is not a A:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
228 #print STDERR "clipping sequence $seq\n";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
229 my $last_base = substr( $seq, ( $length_to_mask + 3 - 1 ), 1 );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
230 my $previous_to_last = substr( $seq, ( $length_to_mask + 3 ), 1 );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
231 if ( !( $last_base eq 'T' || $last_base eq 't' ) ){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
232 $length_to_mask--;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
233 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
234 elsif( $previous_to_last eq 'T' || $previous_to_last eq 't' ){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
235 $length_to_mask++;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
236 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
237 my $clipped_seq = substr( $seq, $length_to_mask + 3);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
238 my $mask;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
239 if ( $self->_clip ){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
240 $mask = "";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
241 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
242 elsif( $self->_mask ){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
243 $mask = "N" x ($length_to_mask+3);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
244 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
245 elsif ($self->_softmask){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
246 $mask = lc substr( $seq, 0, ($length_to_mask + 3) );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
247 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
248 $new_seq = $mask.$clipped_seq;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
249 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
250 else{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
251 $new_seq = $seq;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
252 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
253 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
254 else{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
255 # we cannot be sure of what it is
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
256 # do not clip
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
257 $new_seq = $seq;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
258 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
259
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
260 return $new_seq;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
261 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
262
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
263 ############################################################
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
264
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
265 sub _mask{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
266 my ($self,$mask) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
267 if (defined($mask)){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
268 $self->{_mask} = $mask;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
269 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
270 return $self->{_mask};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
271 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
272
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
273 ############################################################
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
274
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
275 sub _clip{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
276 my ($self,$clip) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
277 if (defined($clip)){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
278 $self->{_clip} = $clip;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
279 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
280 return $self->{_clip};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
281 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
282
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
283 ############################################################
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
284
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
285 sub _softmask{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
286 my ($self,$softmask) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
287 if (defined($softmask)){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
288 $self->{_softmask} = $softmask;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
289 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
290 return $self->{_softmask};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
291 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
292
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
293 ############################################################
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
294
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
295
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
296 sub has_polyA_track{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
297 my ($self, $seq) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
298 my $new_seq;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
299 my $length = length($seq);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
300
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
301 # is it a polyA or polyT?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
302 my $check_polyT = substr( $seq, 0, 10 );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
303
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
304 my $check_polyA = substr( $seq, -10 );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
305
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
306 print STDERR "polyA: $check_polyA\n";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
307
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
308 my $t_count = $check_polyT =~ tr/Tt//;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
309 my $a_count = $check_polyA =~ tr/Aa//;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
310
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
311 ## testing with this short cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
312 if ( $a_count >=7 || $t_count >=7 ){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
313 return 1;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
314 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
315 else{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
316 return 0;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
317 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
318 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
319
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
320
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
321 ################
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
322 1;