Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/RepeatMaskedSlice.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/RepeatMaskedSlice.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,308 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +=head1 NAME + +Bio::EnsEMBL::RepeatMaskedSlice - Arbitary Slice of a genome + +=head1 SYNOPSIS + + $sa = $db->get_SliceAdaptor(); + + $slice = + $sa->fetch_by_region( 'chromosome', 'X', 1_000_000, 2_000_000 ); + + $repeat_masked_slice = $slice->get_repeatmasked_seq(); + + # get repeat masked sequence: + my $dna = $repeat_masked_slice->seq(); + $dna = $repeat_masked_slice->subseq( 1, 1000 ); + +=head1 DESCRIPTION + +This is a specialised Bio::EnsEMBL::Slice class that is used to retrieve +repeat masked genomic sequence rather than normal genomic sequence. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::RepeatMaskedSlice; + +use strict; +use warnings; + +use Bio::EnsEMBL::Slice; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); +use Bio::EnsEMBL::Utils::Exception; + +use vars qw(@ISA); + +@ISA = ('Bio::EnsEMBL::Slice'); + +# The BLOCK_PWR is the lob_bin of the chunksize where you want your repeat features +# to be retreived. This will create repeat feature retrieval calls that are likely +# to be on the same slice and hopefully create cache hits and less database traffic +my $BLOCK_PWR = 18; + + + +=head2 new + + Arg [-REPEAT_MASK] : The logic name of the repeats to be used for masking. + If not provided, all repeats in the database are used. + Arg [...] : Named superclass arguments. See B<Bio::EnsEMBL::Slice>. + Example : my $slice = Bio::EnsEMBL::RepeatMaskedSlice->new + (-START => $start, + -END => $end, + -STRAND => $strand, + -SEQ_REGION_NAME => $seq_region, + -SEQ_REGION_LENGTH => $seq_region_length, + -COORD_SYSTEM => $cs, + -ADAPTOR => $adaptor, + -REPEAT_MASK => ['repeat_masker'], + -SOFT_MASK => 1, + -NOT_DEFAULT_MASKING_CASES => {"repeat_class_SINE/MIR" => 1, + "repeat_name_AluSp" => 0}); + Description: Creates a Slice which behaves exactly as a normal slice but + that returns repeat masked sequence from the seq method. + Returntype : Bio::EnsEMBL::RepeatMaskedSlice + Exceptions : none + Caller : RawComputes (PredictionTranscript creation code). + Status : Stable + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my ($logic_names, $soft_mask, $not_default_masking_cases) = rearrange(['REPEAT_MASK', + 'SOFT_MASK', + 'NOT_DEFAULT_MASKING_CASES'], @_); + + my $self = $class->SUPER::new(@_); + + + $logic_names ||= ['']; + if(ref($logic_names) ne 'ARRAY') { + throw("Reference to list of logic names argument expected."); + } + + $self->{'repeat_mask_logic_names'} = $logic_names; + $self->{'soft_mask'} = $soft_mask; + $self->{'not_default_masking_cases'} = $not_default_masking_cases; + $self->{'not_default_masking_cases'} ||= {}; + + return $self; +} + + +=head2 repeat_mask_logic_names + + Arg [1] : reference to list of strings $logic_names (optional) + Example : $rm_slice->repeat_mask_logic_name(['repeat_masker']); + Description: Getter/Setter for the logic_names of the repeats that are used + to mask this slices sequence. + Returntype : reference to list of strings + Exceptions : none + Caller : seq() method + Status : Stable + +=cut + +sub repeat_mask_logic_names { + my $self = shift; + + if(@_) { + my $array = shift; + if(ref($array) ne 'ARRAY') { + throw('Reference to list of logic names argument expected.'); + } + } + + return $self->{'repeat_mask_logic_names'}; +} + + +=head2 soft_mask + + Arg [1] : boolean $soft_mask (optional) + Example : $rm_slice->soft_mask(0); + Description: Getter/Setter which is used to turn on/off softmasking of the + sequence returned by seq. + Returntype : boolean + Exceptions : none + Caller : seq() method + Status : Stable + +=cut + +sub soft_mask { + my $self = shift; + $self->{'soft_mask'} = shift if(@_); + return $self->{'soft_mask'} || 0; +} + +=head2 not_default_masking_cases + + Arg [1] : hash reference $not_default_masking_cases (optional, default is {}) + The values are 0 or 1 for hard and soft masking respectively + The keys of the hash should be of 2 forms + "repeat_class_" . $repeat_consensus->repeat_class, + e.g. "repeat_class_SINE/MIR" + "repeat_name_" . $repeat_consensus->name + e.g. "repeat_name_MIR" + depending on which base you want to apply the not default masking either + the repeat_class or repeat_name. Both can be specified in the same hash + at the same time, but in that case, repeat_name setting has priority over + repeat_class. For example, you may have hard masking as default, and + you may want soft masking of all repeat_class SINE/MIR, + but repeat_name AluSp (which are also from repeat_class SINE/MIR) + Example : $rm_slice->not_default_masking_cases({"repeat_class_SINE/MIR" => 1, + "repeat_name_AluSp" => 0}); + Description: Getter/Setter which is used to escape some repeat class or name from the default + masking in place. + Returntype : hash reference + Exceptions : none + Caller : seq() and subseq() methods + Status : Stable + +=cut + +sub not_default_masking_cases { + my $self = shift; + $self->{'not_default_masking_cases'} = shift if (@_); + return $self->{'not_default_masking_cases'}; +} + +=head2 seq + + Arg [1] : none + Example : print $rmslice->seq(), "\n"; + Description: Retrieves the entire repeat masked sequence for this slice. + See also the B<Bio::EnsEMBL::Slice> implementation of this + method. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub seq { + my $self = shift; + + # + # get all the features + # + my $repeats = $self->_get_repeat_features($self); + my $soft_mask = $self->soft_mask(); + my $not_default_masking_cases = $self->not_default_masking_cases(); + + # + # get the dna + # + my $dna = $self->SUPER::seq(@_); + + # + # mask the dna + # + $self->_mask_features(\$dna,$repeats,$soft_mask,$not_default_masking_cases); + return $dna; +} + +=head2 subseq + + Arg [1] : none + Example : print $rmslice->subseq(1, 1000); + Description: Retrieves a repeat masked sequence from a specified subregion + of this slice. See also the B<Bio::EnsEMBL::Slice> + implementation of this method. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub subseq { + my $self = shift; + my $start = shift; + my $end = shift; + my $strand = shift; + + my $subsequence_slice = $self->sub_Slice($start, $end, $strand); + + # If frequent subseqs happen on repeatMasked sequence this results in + # a lot of feature retrieval from the database. To avoid this, features + # are only retrieved from subslices with fixed space boundaries. + # The access happens in block to make cache hits more likely + # ONLY DO IF WE ARE CACHING + + my $subslice; + if(! $self->adaptor()->db()->no_cache()) { + + my $seq_region_slice = $self->seq_region_Slice(); + # The blocksize can be defined on the top of this module. + my $block_min = ($subsequence_slice->start()-1) >> $BLOCK_PWR; + my $block_max = ($subsequence_slice->end()-1) >> $BLOCK_PWR; + + my $sub_start = ($block_min << $BLOCK_PWR)+1; + my $sub_end = ($block_max+1)<<$BLOCK_PWR; + if ($sub_end > $seq_region_slice->length) { + $sub_end = $seq_region_slice->length ; + } + $subslice = $seq_region_slice->sub_Slice($sub_start, $sub_end); + } + else { + $subslice = $subsequence_slice; + } + + my $repeats = $self->_get_repeat_features($subslice); + my $soft_mask = $self->soft_mask(); + my $not_default_masking_cases = $self->not_default_masking_cases(); + my $dna = $subsequence_slice->SUPER::seq(); + $subsequence_slice->_mask_features(\$dna,$repeats,$soft_mask,$not_default_masking_cases); + return $dna; +} + +=head2 _get_repeat_features + + Args [1] : Bio::EnsEMBL::Slice to fetch features for + Description : Gets repeat features for the given slice + Returntype : ArrayRef[Bio::EnsEMBL::RepeatFeature] array of repeats + +=cut + + + +sub _get_repeat_features { + my ($self, $slice) = @_; + my $logic_names = $self->repeat_mask_logic_names(); + my @repeats; + foreach my $l (@$logic_names) { + push @repeats, @{$slice->get_all_RepeatFeatures($l)}; + } + return \@repeats; +} + +1;