Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/RepeatMaskedSlice.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 =head1 LICENSE | |
| 2 | |
| 3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
| 4 Genome Research Limited. All rights reserved. | |
| 5 | |
| 6 This software is distributed under a modified Apache license. | |
| 7 For license details, please see | |
| 8 | |
| 9 http://www.ensembl.org/info/about/code_licence.html | |
| 10 | |
| 11 =head1 CONTACT | |
| 12 | |
| 13 Please email comments or questions to the public Ensembl | |
| 14 developers list at <dev@ensembl.org>. | |
| 15 | |
| 16 Questions may also be sent to the Ensembl help desk at | |
| 17 <helpdesk@ensembl.org>. | |
| 18 | |
| 19 =cut | |
| 20 | |
| 21 =head1 NAME | |
| 22 | |
| 23 Bio::EnsEMBL::RepeatMaskedSlice - Arbitrary Slice of a genome | |
| 24 | |
| 25 =head1 SYNOPSIS | |
| 26 | |
| 27 $sa = $db->get_SliceAdaptor(); | |
| 28 | |
| 29 $slice = | |
| 30 $sa->fetch_by_region( 'chromosome', 'X', 1_000_000, 2_000_000 ); | |
| 31 | |
| 32 $repeat_masked_slice = $slice->get_repeatmasked_seq(); | |
| 33 | |
| 34 # get repeat masked sequence: | |
| 35 my $dna = $repeat_masked_slice->seq(); | |
| 36 $dna = $repeat_masked_slice->subseq( 1, 1000 ); | |
| 37 | |
| 38 =head1 DESCRIPTION | |
| 39 | |
| 40 This is a specialised Bio::EnsEMBL::Slice class that is used to retrieve | |
| 41 repeat masked genomic sequence rather than normal genomic sequence. | |
| 42 | |
| 43 =head1 METHODS | |
| 44 | |
| 45 =cut | |
| 46 | |
| 47 package Bio::EnsEMBL::RepeatMaskedSlice; | |
| 48 | |
| 49 use strict; | |
| 50 use warnings; | |
| 51 | |
| 52 use Bio::EnsEMBL::Slice; | |
| 53 use Bio::EnsEMBL::Utils::Argument qw(rearrange); | |
| 54 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); | |
| 55 use Bio::EnsEMBL::Utils::Exception; | |
| 56 | |
| 57 use vars qw(@ISA); | |
| 58 | |
| 59 @ISA = ('Bio::EnsEMBL::Slice'); | |
| 60 | |
| 61 # BLOCK_PWR is the base-2 logarithm of the chunk size in which repeat features | |
| 62 # are retrieved (18 means blocks of 2**18 = 262144 positions). Block-aligned retrieval | |
| 63 # calls are likely to be on the same slice, giving more cache hits and less database traffic | |
| 64 my $BLOCK_PWR = 18; | |
| 65 | |
| 66 | |
| 67 | |
| 68 =head2 new | |
| 69 | |
| 70 Arg [-REPEAT_MASK] : The logic name of the repeats to be used for masking. | |
| 71 If not provided, all repeats in the database are used. | |
| 72 Arg [...] : Named superclass arguments. See B<Bio::EnsEMBL::Slice>. | |
| 73 Example : my $slice = Bio::EnsEMBL::RepeatMaskedSlice->new | |
| 74 (-START => $start, | |
| 75 -END => $end, | |
| 76 -STRAND => $strand, | |
| 77 -SEQ_REGION_NAME => $seq_region, | |
| 78 -SEQ_REGION_LENGTH => $seq_region_length, | |
| 79 -COORD_SYSTEM => $cs, | |
| 80 -ADAPTOR => $adaptor, | |
| 81 -REPEAT_MASK => ['repeat_masker'], | |
| 82 -SOFT_MASK => 1, | |
| 83 -NOT_DEFAULT_MASKING_CASES => {"repeat_class_SINE/MIR" => 1, | |
| 84 "repeat_name_AluSp" => 0}); | |
| 85 Description: Creates a Slice which behaves exactly as a normal slice but | |
| 86 that returns repeat masked sequence from the seq method. | |
| 87 Returntype : Bio::EnsEMBL::RepeatMaskedSlice | |
| 88 Exceptions : thrown if -REPEAT_MASK is not a reference to a list | |
| 89 Caller : RawComputes (PredictionTranscript creation code). | |
| 90 Status : Stable | |
| 91 | |
| 92 =cut | |
| 93 | |
# Constructor. Pulls the masking options (-REPEAT_MASK, -SOFT_MASK and
# -NOT_DEFAULT_MASKING_CASES) out of the named arguments, lets the
# superclass constructor build the object from the complete argument list,
# and then records the masking options on the new object.
# Calls throw() when -REPEAT_MASK is a true value that is not an array
# reference.
sub new {
  my $caller = shift;
  my $class  = ref($caller) || $caller;

  my @option_names = ('REPEAT_MASK', 'SOFT_MASK', 'NOT_DEFAULT_MASKING_CASES');
  my ($logic_names, $soft_mask, $not_default_masking_cases) =
    rearrange(\@option_names, @_);

  my $self = $class->SUPER::new(@_);

  # No logic names supplied: fall back to a list holding one empty name.
  $logic_names = [''] unless $logic_names;
  unless (ref($logic_names) eq 'ARRAY') {
    throw("Reference to list of logic names argument expected.");
  }

  $self->{'repeat_mask_logic_names'}   = $logic_names;
  $self->{'soft_mask'}                 = $soft_mask;
  # Always keep a hash reference here, even when none was supplied.
  $self->{'not_default_masking_cases'} = $not_default_masking_cases || {};

  return $self;
}
| 117 | |
| 118 | |
| 119 =head2 repeat_mask_logic_names | |
| 120 | |
| 121 Arg [1] : reference to list of strings $logic_names (optional) | |
| 122 Example : $rm_slice->repeat_mask_logic_names(['repeat_masker']); | |
| 123 Description: Getter/Setter for the logic_names of the repeats that are used | |
| 124 to mask this slice's sequence. | |
| 125 Returntype : reference to list of strings | |
| 126 Exceptions : thrown if the argument is not a reference to a list | |
| 127 Caller : seq() method | |
| 128 Status : Stable | |
| 129 | |
| 130 =cut | |
| 131 | |
# Getter/setter for the list of repeat logic names used for masking.
#
# With an argument: checks that it is an array reference (calls throw()
# otherwise) and stores it on the object.
# Always returns the currently stored array reference.
sub repeat_mask_logic_names {
  my $self = shift;

  if (@_) {
    my $logic_names = shift;
    if (ref($logic_names) ne 'ARRAY') {
      throw('Reference to list of logic names argument expected.');
    }
    # Store the validated list; without this assignment the setter would
    # check its argument and then silently discard it.
    $self->{'repeat_mask_logic_names'} = $logic_names;
  }

  return $self->{'repeat_mask_logic_names'};
}
| 144 | |
| 145 | |
| 146 =head2 soft_mask | |
| 147 | |
| 148 Arg [1] : boolean $soft_mask (optional) | |
| 149 Example : $rm_slice->soft_mask(0); | |
| 150 Description: Getter/Setter which is used to turn on/off softmasking of the | |
| 151 sequence returned by seq. | |
| 152 Returntype : boolean | |
| 153 Exceptions : none | |
| 154 Caller : seq() method | |
| 155 Status : Stable | |
| 156 | |
| 157 =cut | |
| 158 | |
# Getter/setter for the soft-masking flag.
# With an argument, stores it (even if undef); returns the stored value,
# or 0 when the stored value is false or missing.
sub soft_mask {
  my ($self, @args) = @_;

  if (@args) {
    $self->{'soft_mask'} = $args[0];
  }

  my $flag = $self->{'soft_mask'};
  return $flag ? $flag : 0;
}
| 164 | |
| 165 =head2 not_default_masking_cases | |
| 166 | |
| 167 Arg [1] : hash reference $not_default_masking_cases (optional, default is {}) | |
| 168 The values are 0 or 1 for hard and soft masking respectively | |
| 169 The keys of the hash should be of 2 forms | |
| 170 "repeat_class_" . $repeat_consensus->repeat_class, | |
| 171 e.g. "repeat_class_SINE/MIR" | |
| 172 "repeat_name_" . $repeat_consensus->name | |
| 173 e.g. "repeat_name_MIR" | |
| 174 depending on the basis on which you want to apply the non-default masking: either | |
| 175 the repeat_class or the repeat_name. Both can be specified in the same hash | |
| 176 at the same time, but in that case, the repeat_name setting has priority over | |
| 177 repeat_class. For example, you may have hard masking as default, and | |
| 178 you may want soft masking of all repeat_class SINE/MIR, | |
| 179 but hard masking of repeat_name AluSp (which are also from repeat_class SINE/MIR) | |
| 180 Example : $rm_slice->not_default_masking_cases({"repeat_class_SINE/MIR" => 1, | |
| 181 "repeat_name_AluSp" => 0}); | |
| 182 Description: Getter/Setter which is used to escape some repeat class or name from the default | |
| 183 masking in place. | |
| 184 Returntype : hash reference | |
| 185 Exceptions : none | |
| 186 Caller : seq() and subseq() methods | |
| 187 Status : Stable | |
| 188 | |
| 189 =cut | |
| 190 | |
# Getter/setter for the hash of masking exceptions (keys of the form
# "repeat_class_..." / "repeat_name_...").
# With an argument, stores it; always returns the stored value.
sub not_default_masking_cases {
  my ($self, @args) = @_;

  if (@args) {
    $self->{'not_default_masking_cases'} = $args[0];
  }

  return $self->{'not_default_masking_cases'};
}
| 196 | |
| 197 =head2 seq | |
| 198 | |
| 199 Arg [1] : none | |
| 200 Example : print $rmslice->seq(), "\n"; | |
| 201 Description: Retrieves the entire repeat masked sequence for this slice. | |
| 202 See also the B<Bio::EnsEMBL::Slice> implementation of this | |
| 203 method. | |
| 204 Returntype : string | |
| 205 Exceptions : none | |
| 206 Caller : general | |
| 207 Status : Stable | |
| 208 | |
| 209 =cut | |
| 210 | |
# Returns the sequence of this whole slice with repeats masked.
# Any arguments are forwarded unchanged to the superclass seq().
sub seq {
  my $self = shift;

  # Collect the repeat features on this slice and the masking settings.
  my $repeat_features = $self->_get_repeat_features($self);
  my $use_soft_mask   = $self->soft_mask();
  my $masking_cases   = $self->not_default_masking_cases();

  # Fetch the unmasked sequence from the superclass.
  my $sequence = $self->SUPER::seq(@_);

  # The sequence is handed over by reference and returned afterwards, so
  # _mask_features is expected to mask it in place.
  $self->_mask_features(\$sequence, $repeat_features, $use_soft_mask,
                        $masking_cases);

  return $sequence;
}
| 232 | |
| 233 =head2 subseq | |
| 234 | |
| 235 Args : $start, $end, $strand of the subregion (passed on to sub_Slice) | |
| 236 Example : print $rmslice->subseq(1, 1000); | |
| 237 Description: Retrieves a repeat masked sequence from a specified subregion | |
| 238 of this slice. See also the B<Bio::EnsEMBL::Slice> | |
| 239 implementation of this method. | |
| 240 Returntype : string | |
| 241 Exceptions : none | |
| 242 Caller : general | |
| 243 Status : Stable | |
| 244 | |
| 245 =cut | |
| 246 | |
# Returns the repeat masked sequence of a subregion of this slice.
# $start, $end and $strand are handed to sub_Slice() to obtain the
# subregion.
#
# When the database adaptor caches (no_cache() is false), repeat features
# are not fetched for the exact subregion but for an enclosing region whose
# boundaries are aligned to blocks of 2**$BLOCK_PWR positions. Repeated
# subseq() calls then request the same slices and are more likely to hit
# the cache. Without caching, features are fetched for the subregion itself.
sub subseq {
  my ($self, $start, $end, $strand) = @_;

  my $target_slice = $self->sub_Slice($start, $end, $strand);

  my $feature_slice;
  if ($self->adaptor()->db()->no_cache()) {
    # No caching: fetch features for exactly the requested region.
    $feature_slice = $target_slice;
  }
  else {
    my $region = $self->seq_region_Slice();

    # Indices of the first and last block touched by the subregion.
    my $first_block = ($target_slice->start() - 1) >> $BLOCK_PWR;
    my $last_block  = ($target_slice->end() - 1) >> $BLOCK_PWR;

    # Convert the block indices back to 1-based coordinates.
    my $block_start = ($first_block << $BLOCK_PWR) + 1;
    my $block_end   = ($last_block + 1) << $BLOCK_PWR;

    # Do not run past the end of the sequence region.
    if ($block_end > $region->length) {
      $block_end = $region->length;
    }

    $feature_slice = $region->sub_Slice($block_start, $block_end);
  }

  my $repeat_features = $self->_get_repeat_features($feature_slice);
  my $use_soft_mask   = $self->soft_mask();
  my $masking_cases   = $self->not_default_masking_cases();

  # Unmasked sequence of the requested subregion, then masked via reference.
  my $sequence = $target_slice->SUPER::seq();
  $target_slice->_mask_features(\$sequence, $repeat_features, $use_soft_mask,
                                $masking_cases);

  return $sequence;
}
| 287 | |
| 288 =head2 _get_repeat_features | |
| 289 | |
| 290 Args [1] : Bio::EnsEMBL::Slice to fetch features for | |
| 291 Description : Gets repeat features for the given slice | |
| 292 Returntype : ArrayRef[Bio::EnsEMBL::RepeatFeature] array of repeats | |
| 293 | |
| 294 =cut | |
| 295 | |
| 296 | |
| 297 | |
# Collects the repeat features found on $slice for every logic name
# configured on this object (one get_all_RepeatFeatures call per name) and
# returns them as a single array reference, in logic-name order.
sub _get_repeat_features {
  my $self  = shift;
  my $slice = shift;

  my $features = [];
  for my $logic_name (@{ $self->repeat_mask_logic_names() }) {
    my $found = $slice->get_all_RepeatFeatures($logic_name);
    push @{$features}, @{$found};
  }

  return $features;
}
| 307 | |
| 308 1; |
