0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =cut
|
|
20
|
|
21 =head1 NAME
|
|
22
|
|
23 Bio::EnsEMBL::RepeatMaskedSlice - Arbitary Slice of a genome
|
|
24
|
|
25 =head1 SYNOPSIS
|
|
26
|
|
27 $sa = $db->get_SliceAdaptor();
|
|
28
|
|
29 $slice =
|
|
30 $sa->fetch_by_region( 'chromosome', 'X', 1_000_000, 2_000_000 );
|
|
31
|
|
32 $repeat_masked_slice = $slice->get_repeatmasked_seq();
|
|
33
|
|
34 # get repeat masked sequence:
|
|
35 my $dna = $repeat_masked_slice->seq();
|
|
36 $dna = $repeat_masked_slice->subseq( 1, 1000 );
|
|
37
|
|
38 =head1 DESCRIPTION
|
|
39
|
|
40 This is a specialised Bio::EnsEMBL::Slice class that is used to retrieve
|
|
41 repeat masked genomic sequence rather than normal genomic sequence.
|
|
42
|
|
43 =head1 METHODS
|
|
44
|
|
45 =cut
|
|
46
|
|
47 package Bio::EnsEMBL::RepeatMaskedSlice;
|
|
48
|
|
49 use strict;
|
|
50 use warnings;
|
|
51
|
|
52 use Bio::EnsEMBL::Slice;
|
|
53 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
|
|
54 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
|
|
55 use Bio::EnsEMBL::Utils::Exception;
|
|
56
|
|
57 use vars qw(@ISA);
|
|
58
|
|
59 @ISA = ('Bio::EnsEMBL::Slice');
|
|
60
|
|
61 # The BLOCK_PWR is the lob_bin of the chunksize where you want your repeat features
|
|
62 # to be retreived. This will create repeat feature retrieval calls that are likely
|
|
63 # to be on the same slice and hopefully create cache hits and less database traffic
|
|
64 my $BLOCK_PWR = 18;
|
|
65
|
|
66
|
|
67
|
|
68 =head2 new
|
|
69
|
|
70 Arg [-REPEAT_MASK] : The logic name of the repeats to be used for masking.
|
|
71 If not provided, all repeats in the database are used.
|
|
72 Arg [...] : Named superclass arguments. See B<Bio::EnsEMBL::Slice>.
|
|
73 Example : my $slice = Bio::EnsEMBL::RepeatMaskedSlice->new
|
|
74 (-START => $start,
|
|
75 -END => $end,
|
|
76 -STRAND => $strand,
|
|
77 -SEQ_REGION_NAME => $seq_region,
|
|
78 -SEQ_REGION_LENGTH => $seq_region_length,
|
|
79 -COORD_SYSTEM => $cs,
|
|
80 -ADAPTOR => $adaptor,
|
|
81 -REPEAT_MASK => ['repeat_masker'],
|
|
82 -SOFT_MASK => 1,
|
|
83 -NOT_DEFAULT_MASKING_CASES => {"repeat_class_SINE/MIR" => 1,
|
|
84 "repeat_name_AluSp" => 0});
|
|
85 Description: Creates a Slice which behaves exactly as a normal slice but
|
|
86 that returns repeat masked sequence from the seq method.
|
|
87 Returntype : Bio::EnsEMBL::RepeatMaskedSlice
|
|
88 Exceptions : none
|
|
89 Caller : RawComputes (PredictionTranscript creation code).
|
|
90 Status : Stable
|
|
91
|
|
92 =cut
|
|
93
|
|
94 sub new {
|
|
95 my $caller = shift;
|
|
96 my $class = ref($caller) || $caller;
|
|
97
|
|
98 my ($logic_names, $soft_mask, $not_default_masking_cases) = rearrange(['REPEAT_MASK',
|
|
99 'SOFT_MASK',
|
|
100 'NOT_DEFAULT_MASKING_CASES'], @_);
|
|
101
|
|
102 my $self = $class->SUPER::new(@_);
|
|
103
|
|
104
|
|
105 $logic_names ||= [''];
|
|
106 if(ref($logic_names) ne 'ARRAY') {
|
|
107 throw("Reference to list of logic names argument expected.");
|
|
108 }
|
|
109
|
|
110 $self->{'repeat_mask_logic_names'} = $logic_names;
|
|
111 $self->{'soft_mask'} = $soft_mask;
|
|
112 $self->{'not_default_masking_cases'} = $not_default_masking_cases;
|
|
113 $self->{'not_default_masking_cases'} ||= {};
|
|
114
|
|
115 return $self;
|
|
116 }
|
|
117
|
|
118
|
|
119 =head2 repeat_mask_logic_names
|
|
120
|
|
121 Arg [1] : reference to list of strings $logic_names (optional)
|
|
122 Example : $rm_slice->repeat_mask_logic_name(['repeat_masker']);
|
|
123 Description: Getter/Setter for the logic_names of the repeats that are used
|
|
124 to mask this slices sequence.
|
|
125 Returntype : reference to list of strings
|
|
126 Exceptions : none
|
|
127 Caller : seq() method
|
|
128 Status : Stable
|
|
129
|
|
130 =cut
|
|
131
|
|
132 sub repeat_mask_logic_names {
|
|
133 my $self = shift;
|
|
134
|
|
135 if(@_) {
|
|
136 my $array = shift;
|
|
137 if(ref($array) ne 'ARRAY') {
|
|
138 throw('Reference to list of logic names argument expected.');
|
|
139 }
|
|
140 }
|
|
141
|
|
142 return $self->{'repeat_mask_logic_names'};
|
|
143 }
|
|
144
|
|
145
|
|
146 =head2 soft_mask
|
|
147
|
|
148 Arg [1] : boolean $soft_mask (optional)
|
|
149 Example : $rm_slice->soft_mask(0);
|
|
150 Description: Getter/Setter which is used to turn on/off softmasking of the
|
|
151 sequence returned by seq.
|
|
152 Returntype : boolean
|
|
153 Exceptions : none
|
|
154 Caller : seq() method
|
|
155 Status : Stable
|
|
156
|
|
157 =cut
|
|
158
|
|
159 sub soft_mask {
|
|
160 my $self = shift;
|
|
161 $self->{'soft_mask'} = shift if(@_);
|
|
162 return $self->{'soft_mask'} || 0;
|
|
163 }
|
|
164
|
|
165 =head2 not_default_masking_cases
|
|
166
|
|
167 Arg [1] : hash reference $not_default_masking_cases (optional, default is {})
|
|
168 The values are 0 or 1 for hard and soft masking respectively
|
|
169 The keys of the hash should be of 2 forms
|
|
170 "repeat_class_" . $repeat_consensus->repeat_class,
|
|
171 e.g. "repeat_class_SINE/MIR"
|
|
172 "repeat_name_" . $repeat_consensus->name
|
|
173 e.g. "repeat_name_MIR"
|
|
174 depending on which base you want to apply the not default masking either
|
|
175 the repeat_class or repeat_name. Both can be specified in the same hash
|
|
176 at the same time, but in that case, repeat_name setting has priority over
|
|
177 repeat_class. For example, you may have hard masking as default, and
|
|
178 you may want soft masking of all repeat_class SINE/MIR,
|
|
179 but repeat_name AluSp (which are also from repeat_class SINE/MIR)
|
|
180 Example : $rm_slice->not_default_masking_cases({"repeat_class_SINE/MIR" => 1,
|
|
181 "repeat_name_AluSp" => 0});
|
|
182 Description: Getter/Setter which is used to escape some repeat class or name from the default
|
|
183 masking in place.
|
|
184 Returntype : hash reference
|
|
185 Exceptions : none
|
|
186 Caller : seq() and subseq() methods
|
|
187 Status : Stable
|
|
188
|
|
189 =cut
|
|
190
|
|
191 sub not_default_masking_cases {
|
|
192 my $self = shift;
|
|
193 $self->{'not_default_masking_cases'} = shift if (@_);
|
|
194 return $self->{'not_default_masking_cases'};
|
|
195 }
|
|
196
|
|
197 =head2 seq
|
|
198
|
|
199 Arg [1] : none
|
|
200 Example : print $rmslice->seq(), "\n";
|
|
201 Description: Retrieves the entire repeat masked sequence for this slice.
|
|
202 See also the B<Bio::EnsEMBL::Slice> implementation of this
|
|
203 method.
|
|
204 Returntype : string
|
|
205 Exceptions : none
|
|
206 Caller : general
|
|
207 Status : Stable
|
|
208
|
|
209 =cut
|
|
210
|
|
211 sub seq {
|
|
212 my $self = shift;
|
|
213
|
|
214 #
|
|
215 # get all the features
|
|
216 #
|
|
217 my $repeats = $self->_get_repeat_features($self);
|
|
218 my $soft_mask = $self->soft_mask();
|
|
219 my $not_default_masking_cases = $self->not_default_masking_cases();
|
|
220
|
|
221 #
|
|
222 # get the dna
|
|
223 #
|
|
224 my $dna = $self->SUPER::seq(@_);
|
|
225
|
|
226 #
|
|
227 # mask the dna
|
|
228 #
|
|
229 $self->_mask_features(\$dna,$repeats,$soft_mask,$not_default_masking_cases);
|
|
230 return $dna;
|
|
231 }
|
|
232
|
|
233 =head2 subseq
|
|
234
|
|
235 Arg [1] : none
|
|
236 Example : print $rmslice->subseq(1, 1000);
|
|
237 Description: Retrieves a repeat masked sequence from a specified subregion
|
|
238 of this slice. See also the B<Bio::EnsEMBL::Slice>
|
|
239 implementation of this method.
|
|
240 Returntype : string
|
|
241 Exceptions : none
|
|
242 Caller : general
|
|
243 Status : Stable
|
|
244
|
|
245 =cut
|
|
246
|
|
247 sub subseq {
|
|
248 my $self = shift;
|
|
249 my $start = shift;
|
|
250 my $end = shift;
|
|
251 my $strand = shift;
|
|
252
|
|
253 my $subsequence_slice = $self->sub_Slice($start, $end, $strand);
|
|
254
|
|
255 # If frequent subseqs happen on repeatMasked sequence this results in
|
|
256 # a lot of feature retrieval from the database. To avoid this, features
|
|
257 # are only retrieved from subslices with fixed space boundaries.
|
|
258 # The access happens in block to make cache hits more likely
|
|
259 # ONLY DO IF WE ARE CACHING
|
|
260
|
|
261 my $subslice;
|
|
262 if(! $self->adaptor()->db()->no_cache()) {
|
|
263
|
|
264 my $seq_region_slice = $self->seq_region_Slice();
|
|
265 # The blocksize can be defined on the top of this module.
|
|
266 my $block_min = ($subsequence_slice->start()-1) >> $BLOCK_PWR;
|
|
267 my $block_max = ($subsequence_slice->end()-1) >> $BLOCK_PWR;
|
|
268
|
|
269 my $sub_start = ($block_min << $BLOCK_PWR)+1;
|
|
270 my $sub_end = ($block_max+1)<<$BLOCK_PWR;
|
|
271 if ($sub_end > $seq_region_slice->length) {
|
|
272 $sub_end = $seq_region_slice->length ;
|
|
273 }
|
|
274 $subslice = $seq_region_slice->sub_Slice($sub_start, $sub_end);
|
|
275 }
|
|
276 else {
|
|
277 $subslice = $subsequence_slice;
|
|
278 }
|
|
279
|
|
280 my $repeats = $self->_get_repeat_features($subslice);
|
|
281 my $soft_mask = $self->soft_mask();
|
|
282 my $not_default_masking_cases = $self->not_default_masking_cases();
|
|
283 my $dna = $subsequence_slice->SUPER::seq();
|
|
284 $subsequence_slice->_mask_features(\$dna,$repeats,$soft_mask,$not_default_masking_cases);
|
|
285 return $dna;
|
|
286 }
|
|
287
|
|
288 =head2 _get_repeat_features
|
|
289
|
|
290 Args [1] : Bio::EnsEMBL::Slice to fetch features for
|
|
291 Description : Gets repeat features for the given slice
|
|
292 Returntype : ArrayRef[Bio::EnsEMBL::RepeatFeature] array of repeats
|
|
293
|
|
294 =cut
|
|
295
|
|
296
|
|
297
|
|
298 sub _get_repeat_features {
|
|
299 my ($self, $slice) = @_;
|
|
300 my $logic_names = $self->repeat_mask_logic_names();
|
|
301 my @repeats;
|
|
302 foreach my $l (@$logic_names) {
|
|
303 push @repeats, @{$slice->get_all_RepeatFeatures($l)};
|
|
304 }
|
|
305 return \@repeats;
|
|
306 }
|
|
307
|
|
308 1;
|