Mercurial > repos > willmclaren > ensembl_vep
diff variant_effect_predictor/Bio/EnsEMBL/Utils/PolyA.pm @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/PolyA.pm Fri Aug 03 10:04:48 2012 -0400 @@ -0,0 +1,322 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::PolyA + +=head1 SYNOPSIS + + my $seq; # a Bio::Seq object + my $polyA = Bio::EnsEMBL::Utils::PolyA->new(); + + # returns a new Bio::Seq object with the trimmed sequence + my $trimmed_seq = $polyA->clip($seq); + + # cat put Ns in the place of the polyA/polyT tail + my $masked_seq = $polyA->mask($seq); + + # can put in lower case the polyA/polyT using any flag: + my $softmasked_seq = $poly->mask( $seq, 'soft' ); + +=head1 DESCRIPTION + + It reads a Bio::Seq object, it first finds out whether it has a + polyA or a polyT and then performs one operation in the seq string: + clipping, masking or softmasking. It then returns a new Bio::Seq + object with the new sequence. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Utils::PolyA; + +use Bio::EnsEMBL::Root; +use Bio::Seq; +use vars qw(@ISA); + +use strict; + +@ISA = qw(Bio::EnsEMBL::Root); + + +=head2 new + +=cut + +sub new{ + my ($class) = @_; + if (ref($class)){ + $class = ref($class); + } + my $self = {}; + bless($self,$class); + + return $self; +} + + +############################################################ + +sub clip{ + my ($self, $bioseq) = @_; + + # print STDERR "past a $bioseq\n"; + my $seq = $bioseq->seq; + $self->_clip(1); + $self->_mask(0); + $self->_softmask(0); + my $new_seq = $self->_find_polyA($seq); + my $cdna = Bio::Seq->new(); + if (length($new_seq) > 0){ + $cdna->seq($new_seq); + } + else { + print "While clipping the the polyA tail, sequence ".$bioseq->display_id." totally disappeared.\n"; + print "Returning undef\n"; + return undef; + } + $cdna->display_id( $bioseq->display_id ); + $cdna->desc( $bioseq->desc ); + + return $cdna; +} + +############################################################ + +sub mask{ + my ($self, $bioseq, $flag ) = @_; + + my $seq = $bioseq->seq; + $self->_clip(0); + if ( $flag ){ + $self->_mask(0); + $self->_softmask(1); + } + else{ + $self->_mask(1); + $self->_softmask(0); + } + my $new_seq = $self->_find_polyA($seq); + my $cdna = new Bio::Seq; + $cdna->seq($new_seq); + $cdna->display_id( $bioseq->display_id ); + $cdna->desc( $bioseq->desc ); + + return $cdna; +} + +############################################################ + + + + +############################################################ + +sub _find_polyA{ + my ($self, $seq) = @_; + my $new_seq; + my $length = length($seq); + + # is it a polyA or polyT? + my $check_polyT = substr( $seq, 0, 6 ); + + my $check_polyA = substr( $seq, -6 ); + + my $t_count = $check_polyT =~ tr/Tt//; + my $a_count = $check_polyA =~ tr/Aa//; + + #### polyA #### + if ( $a_count >= 5 && $a_count > $t_count ){ + + # we calculate the number of bases we want to chop + my $length_to_mask = 0; + + # we start with 3 bases + my ($piece, $count ) = (3,0); + + # count also the number of Ns, consider the Ns as potential As + my $n_count = 0; + + # take 3 by 3 bases from the end + while( $length_to_mask < $length ){ + my $chunk = substr( $seq, ($length - ($length_to_mask + 3)), $piece); + $count = $chunk =~ tr/Aa//; + $n_count = $chunk =~ tr/Nn//; + if ( ($count + $n_count) >= 2*( $piece )/3 ){ + $length_to_mask += 3; + } + else{ + last; + } + } + + if ( $length_to_mask > 0 ){ + # do not mask the last base if it is not an A: + my $last_base = substr( $seq, ( $length - $length_to_mask ), 1); + my $previous_to_last = substr( $seq, ( $length - $length_to_mask - 1), 1); + if ( !( $last_base eq 'A' || $last_base eq 'a') ){ + $length_to_mask--; + } + elsif( $previous_to_last eq 'A' || $previous_to_last eq 'a' ){ + $length_to_mask++; + } + my $clipped_seq = substr( $seq, 0, $length - $length_to_mask ); + my $mask; + if ( $self->_clip ){ + $mask = ""; + } + elsif( $self->_mask ){ + $mask = "N" x ($length_to_mask); + } + elsif ( $self->_softmask ){ + $mask = lc substr( $seq, ( $length - $length_to_mask ) ); + } + $new_seq = $clipped_seq . $mask; + } + else{ + $new_seq = $seq; + } + } + #### polyT #### + elsif( $t_count >=5 && $t_count > $a_count ){ + + # calculate the number of bases to chop + my $length_to_mask = -3; + + # we start with 3 bases: + my ($piece, $count) = (3,3); + + # count also the number of Ns, consider the Ns as potential As + my $n_count = 0; + + # take 3 by 3 bases from the beginning + while ( $length_to_mask < $length ){ + my $chunk = substr( $seq, $length_to_mask + 3, $piece ); + #print STDERR "length to mask: $length_to_mask\n"; + #print "chunk: $chunk\n"; + $count = $chunk =~ tr/Tt//; + $n_count = $chunk =~ tr/Nn//; + if ( ($count+$n_count) >= 2*( $piece )/3 ){ + $length_to_mask +=3; + } + else{ + last; + + } + } + if ( $length_to_mask >= 0 ){ + # do not chop the last base if it is not a A: + #print STDERR "clipping sequence $seq\n"; + my $last_base = substr( $seq, ( $length_to_mask + 3 - 1 ), 1 ); + my $previous_to_last = substr( $seq, ( $length_to_mask + 3 ), 1 ); + if ( !( $last_base eq 'T' || $last_base eq 't' ) ){ + $length_to_mask--; + } + elsif( $previous_to_last eq 'T' || $previous_to_last eq 't' ){ + $length_to_mask++; + } + my $clipped_seq = substr( $seq, $length_to_mask + 3); + my $mask; + if ( $self->_clip ){ + $mask = ""; + } + elsif( $self->_mask ){ + $mask = "N" x ($length_to_mask+3); + } + elsif ($self->_softmask){ + $mask = lc substr( $seq, 0, ($length_to_mask + 3) ); + } + $new_seq = $mask.$clipped_seq; + } + else{ + $new_seq = $seq; + } + } + else{ + # we cannot be sure of what it is + # do not clip + $new_seq = $seq; + } + + return $new_seq; +} + +############################################################ + +sub _mask{ + my ($self,$mask) = @_; + if (defined($mask)){ + $self->{_mask} = $mask; + } + return $self->{_mask}; +} + +############################################################ + +sub _clip{ + my ($self,$clip) = @_; + if (defined($clip)){ + $self->{_clip} = $clip; + } + return $self->{_clip}; +} + +############################################################ + +sub _softmask{ + my ($self,$softmask) = @_; + if (defined($softmask)){ + $self->{_softmask} = $softmask; + } + return $self->{_softmask}; +} + +############################################################ + + +sub has_polyA_track{ + my ($self, $seq) = @_; + my $new_seq; + my $length = length($seq); + + # is it a polyA or polyT? + my $check_polyT = substr( $seq, 0, 10 ); + + my $check_polyA = substr( $seq, -10 ); + + print STDERR "polyA: $check_polyA\n"; + + my $t_count = $check_polyT =~ tr/Tt//; + my $a_count = $check_polyA =~ tr/Aa//; + + ## testing with this short cut + if ( $a_count >=7 || $t_count >=7 ){ + return 1; + } + else{ + return 0; + } +} + + +################ +1;