comparison variant_effect_predictor/Bio/EnsEMBL/RepeatMaskedSlice.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21 =head1 NAME
22
23 Bio::EnsEMBL::RepeatMaskedSlice - Arbitary Slice of a genome
24
25 =head1 SYNOPSIS
26
27 $sa = $db->get_SliceAdaptor();
28
29 $slice =
30 $sa->fetch_by_region( 'chromosome', 'X', 1_000_000, 2_000_000 );
31
32 $repeat_masked_slice = $slice->get_repeatmasked_seq();
33
34 # get repeat masked sequence:
35 my $dna = $repeat_masked_slice->seq();
36 $dna = $repeat_masked_slice->subseq( 1, 1000 );
37
38 =head1 DESCRIPTION
39
40 This is a specialised Bio::EnsEMBL::Slice class that is used to retrieve
41 repeat masked genomic sequence rather than normal genomic sequence.
42
43 =head1 METHODS
44
45 =cut
46
47 package Bio::EnsEMBL::RepeatMaskedSlice;
48
49 use strict;
50 use warnings;
51
52 use Bio::EnsEMBL::Slice;
53 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
54 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
55 use Bio::EnsEMBL::Utils::Exception;
56
57 use vars qw(@ISA);
58
59 @ISA = ('Bio::EnsEMBL::Slice');
60
61 # The BLOCK_PWR is the lob_bin of the chunksize where you want your repeat features
62 # to be retreived. This will create repeat feature retrieval calls that are likely
63 # to be on the same slice and hopefully create cache hits and less database traffic
64 my $BLOCK_PWR = 18;
65
66
67
68 =head2 new
69
70 Arg [-REPEAT_MASK] : The logic name of the repeats to be used for masking.
71 If not provided, all repeats in the database are used.
72 Arg [...] : Named superclass arguments. See B<Bio::EnsEMBL::Slice>.
73 Example : my $slice = Bio::EnsEMBL::RepeatMaskedSlice->new
74 (-START => $start,
75 -END => $end,
76 -STRAND => $strand,
77 -SEQ_REGION_NAME => $seq_region,
78 -SEQ_REGION_LENGTH => $seq_region_length,
79 -COORD_SYSTEM => $cs,
80 -ADAPTOR => $adaptor,
81 -REPEAT_MASK => ['repeat_masker'],
82 -SOFT_MASK => 1,
83 -NOT_DEFAULT_MASKING_CASES => {"repeat_class_SINE/MIR" => 1,
84 "repeat_name_AluSp" => 0});
85 Description: Creates a Slice which behaves exactly as a normal slice but
86 that returns repeat masked sequence from the seq method.
87 Returntype : Bio::EnsEMBL::RepeatMaskedSlice
88 Exceptions : none
89 Caller : RawComputes (PredictionTranscript creation code).
90 Status : Stable
91
92 =cut
93
94 sub new {
95 my $caller = shift;
96 my $class = ref($caller) || $caller;
97
98 my ($logic_names, $soft_mask, $not_default_masking_cases) = rearrange(['REPEAT_MASK',
99 'SOFT_MASK',
100 'NOT_DEFAULT_MASKING_CASES'], @_);
101
102 my $self = $class->SUPER::new(@_);
103
104
105 $logic_names ||= [''];
106 if(ref($logic_names) ne 'ARRAY') {
107 throw("Reference to list of logic names argument expected.");
108 }
109
110 $self->{'repeat_mask_logic_names'} = $logic_names;
111 $self->{'soft_mask'} = $soft_mask;
112 $self->{'not_default_masking_cases'} = $not_default_masking_cases;
113 $self->{'not_default_masking_cases'} ||= {};
114
115 return $self;
116 }
117
118
119 =head2 repeat_mask_logic_names
120
121 Arg [1] : reference to list of strings $logic_names (optional)
122 Example : $rm_slice->repeat_mask_logic_name(['repeat_masker']);
123 Description: Getter/Setter for the logic_names of the repeats that are used
124 to mask this slices sequence.
125 Returntype : reference to list of strings
126 Exceptions : none
127 Caller : seq() method
128 Status : Stable
129
130 =cut
131
132 sub repeat_mask_logic_names {
133 my $self = shift;
134
135 if(@_) {
136 my $array = shift;
137 if(ref($array) ne 'ARRAY') {
138 throw('Reference to list of logic names argument expected.');
139 }
140 }
141
142 return $self->{'repeat_mask_logic_names'};
143 }
144
145
146 =head2 soft_mask
147
148 Arg [1] : boolean $soft_mask (optional)
149 Example : $rm_slice->soft_mask(0);
150 Description: Getter/Setter which is used to turn on/off softmasking of the
151 sequence returned by seq.
152 Returntype : boolean
153 Exceptions : none
154 Caller : seq() method
155 Status : Stable
156
157 =cut
158
159 sub soft_mask {
160 my $self = shift;
161 $self->{'soft_mask'} = shift if(@_);
162 return $self->{'soft_mask'} || 0;
163 }
164
165 =head2 not_default_masking_cases
166
167 Arg [1] : hash reference $not_default_masking_cases (optional, default is {})
168 The values are 0 or 1 for hard and soft masking respectively
169 The keys of the hash should be of 2 forms
170 "repeat_class_" . $repeat_consensus->repeat_class,
171 e.g. "repeat_class_SINE/MIR"
172 "repeat_name_" . $repeat_consensus->name
173 e.g. "repeat_name_MIR"
174 depending on which base you want to apply the not default masking either
175 the repeat_class or repeat_name. Both can be specified in the same hash
176 at the same time, but in that case, repeat_name setting has priority over
177 repeat_class. For example, you may have hard masking as default, and
178 you may want soft masking of all repeat_class SINE/MIR,
179 but repeat_name AluSp (which are also from repeat_class SINE/MIR)
180 Example : $rm_slice->not_default_masking_cases({"repeat_class_SINE/MIR" => 1,
181 "repeat_name_AluSp" => 0});
182 Description: Getter/Setter which is used to escape some repeat class or name from the default
183 masking in place.
184 Returntype : hash reference
185 Exceptions : none
186 Caller : seq() and subseq() methods
187 Status : Stable
188
189 =cut
190
191 sub not_default_masking_cases {
192 my $self = shift;
193 $self->{'not_default_masking_cases'} = shift if (@_);
194 return $self->{'not_default_masking_cases'};
195 }
196
197 =head2 seq
198
199 Arg [1] : none
200 Example : print $rmslice->seq(), "\n";
201 Description: Retrieves the entire repeat masked sequence for this slice.
202 See also the B<Bio::EnsEMBL::Slice> implementation of this
203 method.
204 Returntype : string
205 Exceptions : none
206 Caller : general
207 Status : Stable
208
209 =cut
210
211 sub seq {
212 my $self = shift;
213
214 #
215 # get all the features
216 #
217 my $repeats = $self->_get_repeat_features($self);
218 my $soft_mask = $self->soft_mask();
219 my $not_default_masking_cases = $self->not_default_masking_cases();
220
221 #
222 # get the dna
223 #
224 my $dna = $self->SUPER::seq(@_);
225
226 #
227 # mask the dna
228 #
229 $self->_mask_features(\$dna,$repeats,$soft_mask,$not_default_masking_cases);
230 return $dna;
231 }
232
233 =head2 subseq
234
235 Arg [1] : none
236 Example : print $rmslice->subseq(1, 1000);
237 Description: Retrieves a repeat masked sequence from a specified subregion
238 of this slice. See also the B<Bio::EnsEMBL::Slice>
239 implementation of this method.
240 Returntype : string
241 Exceptions : none
242 Caller : general
243 Status : Stable
244
245 =cut
246
247 sub subseq {
248 my $self = shift;
249 my $start = shift;
250 my $end = shift;
251 my $strand = shift;
252
253 my $subsequence_slice = $self->sub_Slice($start, $end, $strand);
254
255 # If frequent subseqs happen on repeatMasked sequence this results in
256 # a lot of feature retrieval from the database. To avoid this, features
257 # are only retrieved from subslices with fixed space boundaries.
258 # The access happens in block to make cache hits more likely
259 # ONLY DO IF WE ARE CACHING
260
261 my $subslice;
262 if(! $self->adaptor()->db()->no_cache()) {
263
264 my $seq_region_slice = $self->seq_region_Slice();
265 # The blocksize can be defined on the top of this module.
266 my $block_min = ($subsequence_slice->start()-1) >> $BLOCK_PWR;
267 my $block_max = ($subsequence_slice->end()-1) >> $BLOCK_PWR;
268
269 my $sub_start = ($block_min << $BLOCK_PWR)+1;
270 my $sub_end = ($block_max+1)<<$BLOCK_PWR;
271 if ($sub_end > $seq_region_slice->length) {
272 $sub_end = $seq_region_slice->length ;
273 }
274 $subslice = $seq_region_slice->sub_Slice($sub_start, $sub_end);
275 }
276 else {
277 $subslice = $subsequence_slice;
278 }
279
280 my $repeats = $self->_get_repeat_features($subslice);
281 my $soft_mask = $self->soft_mask();
282 my $not_default_masking_cases = $self->not_default_masking_cases();
283 my $dna = $subsequence_slice->SUPER::seq();
284 $subsequence_slice->_mask_features(\$dna,$repeats,$soft_mask,$not_default_masking_cases);
285 return $dna;
286 }
287
288 =head2 _get_repeat_features
289
290 Args [1] : Bio::EnsEMBL::Slice to fetch features for
291 Description : Gets repeat features for the given slice
292 Returntype : ArrayRef[Bio::EnsEMBL::RepeatFeature] array of repeats
293
294 =cut
295
296
297
298 sub _get_repeat_features {
299 my ($self, $slice) = @_;
300 my $logic_names = $self->repeat_mask_logic_names();
301 my @repeats;
302 foreach my $l (@$logic_names) {
303 push @repeats, @{$slice->get_all_RepeatFeatures($l)};
304 }
305 return \@repeats;
306 }
307
308 1;