Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/RepeatMaskedSlice.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <dev@ensembl.org>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =cut | |
20 | |
21 =head1 NAME | |
22 | |
23 Bio::EnsEMBL::RepeatMaskedSlice - Arbitrary Slice of a genome | |
24 | |
25 =head1 SYNOPSIS | |
26 | |
27 $sa = $db->get_SliceAdaptor(); | |
28 | |
29 $slice = | |
30 $sa->fetch_by_region( 'chromosome', 'X', 1_000_000, 2_000_000 ); | |
31 | |
32 $repeat_masked_slice = $slice->get_repeatmasked_seq(); | |
33 | |
34 # get repeat masked sequence: | |
35 my $dna = $repeat_masked_slice->seq(); | |
36 $dna = $repeat_masked_slice->subseq( 1, 1000 ); | |
37 | |
38 =head1 DESCRIPTION | |
39 | |
40 This is a specialised Bio::EnsEMBL::Slice class that is used to retrieve | |
41 repeat masked genomic sequence rather than normal genomic sequence. | |
42 | |
43 =head1 METHODS | |
44 | |
45 =cut | |
46 | |
47 package Bio::EnsEMBL::RepeatMaskedSlice; | |
48 | |
49 use strict; | |
50 use warnings; | |
51 | |
52 use Bio::EnsEMBL::Slice; | |
53 use Bio::EnsEMBL::Utils::Argument qw(rearrange); | |
54 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); | |
55 use Bio::EnsEMBL::Utils::Exception; | |
56 | |
# This class specialises the plain Slice class.
our @ISA = ('Bio::EnsEMBL::Slice');

# $BLOCK_PWR is the log base 2 of the block size (2**18 = 262144) used by
# subseq() when retrieving repeat features. Fetching features on fixed block
# boundaries makes it likely that successive retrievals are for the same
# slice, which should give cache hits and less database traffic.
my $BLOCK_PWR = 18;
65 | |
66 | |
67 | |
=head2 new

  Arg [-REPEAT_MASK] : (optional) reference to a list of repeat logic names
                       to be used for masking. When omitted the list
                       defaults to [''] (a single empty logic name), which
                       according to the original documentation means that
                       all repeats in the database are used.
  Arg [-SOFT_MASK]   : (optional) boolean stored as the soft-masking flag.
  Arg [-NOT_DEFAULT_MASKING_CASES] :
                       (optional) hash reference of masking exceptions;
                       defaults to {}. See not_default_masking_cases().
  Arg [...]          : Named superclass arguments. See B<Bio::EnsEMBL::Slice>.
  Example    : my $slice = Bio::EnsEMBL::RepeatMaskedSlice->new
                  (-START                     => $start,
                   -END                       => $end,
                   -STRAND                    => $strand,
                   -SEQ_REGION_NAME           => $seq_region,
                   -SEQ_REGION_LENGTH         => $seq_region_length,
                   -COORD_SYSTEM              => $cs,
                   -ADAPTOR                   => $adaptor,
                   -REPEAT_MASK               => ['repeat_masker'],
                   -SOFT_MASK                 => 1,
                   -NOT_DEFAULT_MASKING_CASES => {"repeat_class_SINE/MIR" => 1,
                                                  "repeat_name_AluSp"     => 0});
  Description: Creates a Slice which behaves as a normal slice but which
               returns repeat masked sequence from the seq and subseq
               methods.
  Returntype : Bio::EnsEMBL::RepeatMaskedSlice
  Exceptions : thrown if -REPEAT_MASK is supplied but is not an array
               reference
  Caller     : RawComputes (PredictionTranscript creation code).
  Status     : Stable

=cut

sub new {
  my ($caller, @args) = @_;
  my $class = ref($caller) || $caller;

  # Pick out the arguments specific to this class; the complete argument
  # list is still handed to the superclass constructor.
  my ($logic_names, $soft_mask, $not_default_masking_cases) =
    rearrange([qw(REPEAT_MASK SOFT_MASK NOT_DEFAULT_MASKING_CASES)], @args);

  my $self = $class->SUPER::new(@args);

  $logic_names = [''] unless $logic_names;
  throw("Reference to list of logic names argument expected.")
    unless ref($logic_names) eq 'ARRAY';

  $self->{'repeat_mask_logic_names'}   = $logic_names;
  $self->{'soft_mask'}                 = $soft_mask;
  $self->{'not_default_masking_cases'} = $not_default_masking_cases || {};

  return $self;
}
117 | |
118 | |
=head2 repeat_mask_logic_names

  Arg [1]    : reference to list of strings $logic_names (optional)
  Example    : $rm_slice->repeat_mask_logic_names(['repeat_masker']);
  Description: Getter/Setter for the logic_names of the repeats that are used
               to mask this slice's sequence. When an argument is supplied
               it replaces the stored list.
  Returntype : reference to list of strings
  Exceptions : thrown if an argument is supplied that is not an array
               reference
  Caller     : _get_repeat_features() method
  Status     : Stable

=cut

sub repeat_mask_logic_names {
  my $self = shift;

  if (@_) {
    my $logic_names = shift;
    if (ref($logic_names) ne 'ARRAY') {
      throw('Reference to list of logic names argument expected.');
    }

    # Store the validated list. Previously the argument was checked but then
    # discarded, so calling this method as a setter had no effect.
    $self->{'repeat_mask_logic_names'} = $logic_names;
  }

  return $self->{'repeat_mask_logic_names'};
}
144 | |
145 | |
=head2 soft_mask

  Arg [1]    : boolean $soft_mask (optional)
  Example    : $rm_slice->soft_mask(0);
  Description: Getter/Setter for the flag that turns soft masking of the
               sequence returned by seq and subseq on or off. A false or
               unset flag is reported as 0.
  Returntype : boolean
  Exceptions : none
  Caller     : seq() and subseq() methods
  Status     : Stable

=cut

sub soft_mask {
  my ($self, @new_value) = @_;

  # Any supplied argument, even an undefined one, replaces the stored flag.
  if (@new_value) {
    $self->{'soft_mask'} = $new_value[0];
  }

  return $self->{'soft_mask'} || 0;
}
164 | |
=head2 not_default_masking_cases

  Arg [1]    : hash reference $not_default_masking_cases (optional; the
               constructor defaults it to {}).
               The values are 0 or 1 for hard and soft masking respectively.
               The keys take one of two forms:
                 "repeat_class_" . $repeat_consensus->repeat_class,
                    e.g. "repeat_class_SINE/MIR"
                 "repeat_name_" . $repeat_consensus->name
                    e.g. "repeat_name_MIR"
               depending on whether the exception is to be applied by
               repeat_class or by repeat_name. Both forms can be given in the
               same hash, in which case a repeat_name setting has priority
               over a repeat_class setting. For example, with hard masking as
               the default you may want soft masking of all repeats of
               repeat_class SINE/MIR, but still hard masking of repeat_name
               AluSp (which is also of repeat_class SINE/MIR).
  Example    : $rm_slice->not_default_masking_cases({"repeat_class_SINE/MIR" => 1,
                                                     "repeat_name_AluSp"     => 0});
  Description: Getter/Setter for the repeat classes or names that are exempt
               from the default masking in place.
  Returntype : hash reference
  Exceptions : none
  Caller     : seq() and subseq() methods
  Status     : Stable

=cut

sub not_default_masking_cases {
  my ($self, @new_cases) = @_;

  # Any supplied argument replaces the stored value; it is not validated.
  if (@new_cases) {
    $self->{'not_default_masking_cases'} = $new_cases[0];
  }

  return $self->{'not_default_masking_cases'};
}
196 | |
=head2 seq

  Arg [...]  : none required; any arguments given are passed on unchanged to
               the superclass seq method
  Example    : print $rmslice->seq(), "\n";
  Description: Retrieves the entire repeat masked sequence for this slice.
               See also the B<Bio::EnsEMBL::Slice> implementation of this
               method.
  Returntype : string
  Exceptions : none
  Caller     : general
  Status     : Stable

=cut

sub seq {
  my ($self, @super_args) = @_;

  # Collect the repeat features and the masking settings for this slice.
  my $repeat_features = $self->_get_repeat_features($self);
  my $use_soft_mask   = $self->soft_mask();
  my $mask_exceptions = $self->not_default_masking_cases();

  # Fetch the unmasked sequence from the superclass.
  my $sequence = $self->SUPER::seq(@super_args);

  # _mask_features receives a reference and masks the sequence in place.
  $self->_mask_features(\$sequence, $repeat_features, $use_soft_mask,
                        $mask_exceptions);

  return $sequence;
}
232 | |
=head2 subseq

  Arg [1]    : int $start  - start of the subregion, passed to sub_Slice()
  Arg [2]    : int $end    - end of the subregion, passed to sub_Slice()
  Arg [3]    : int $strand - (optional) strand, passed to sub_Slice()
  Example    : print $rmslice->subseq(1, 1000);
  Description: Retrieves a repeat masked sequence from a specified subregion
               of this slice. See also the B<Bio::EnsEMBL::Slice>
               implementation of this method.
  Returntype : string
  Exceptions : thrown if no sub-slice can be created for the requested
               region
  Caller     : general
  Status     : Stable

=cut

sub subseq {
  my ($self, $start, $end, $strand) = @_;

  my $subsequence_slice = $self->sub_Slice($start, $end, $strand);

  # Fail with a clear message instead of an opaque "Can't call method on an
  # undefined value" further down when no sub-slice could be created.
  if (!defined $subsequence_slice) {
    throw('Unable to create a sub-slice for the requested region; '
        . 'check the start, end and strand arguments.');
  }

  # If frequent subseqs happen on repeat masked sequence this results in
  # a lot of feature retrieval from the database. To avoid this, features
  # are only retrieved from subslices with fixed block boundaries, which
  # makes cache hits more likely. This is only done when caching is enabled.
  my $feature_slice;
  if ($self->adaptor()->db()->no_cache()) {
    $feature_slice = $subsequence_slice;
  }
  else {
    my $seq_region_slice  = $self->seq_region_Slice();
    my $seq_region_length = $seq_region_slice->length;

    # The block size is 2**$BLOCK_PWR, defined at the top of this module.
    my $block_min = ($subsequence_slice->start() - 1) >> $BLOCK_PWR;
    my $block_max = ($subsequence_slice->end() - 1) >> $BLOCK_PWR;

    # Round outwards to whole blocks, clamped to the end of the seq region.
    my $sub_start = ($block_min << $BLOCK_PWR) + 1;
    my $sub_end   = ($block_max + 1) << $BLOCK_PWR;
    if ($sub_end > $seq_region_length) {
      $sub_end = $seq_region_length;
    }

    $feature_slice = $seq_region_slice->sub_Slice($sub_start, $sub_end);
  }

  my $repeats                   = $self->_get_repeat_features($feature_slice);
  my $soft_mask                 = $self->soft_mask();
  my $not_default_masking_cases = $self->not_default_masking_cases();

  # Only the requested region is fetched and masked, even when the features
  # were retrieved for a larger block-aligned slice.
  my $dna = $subsequence_slice->SUPER::seq();
  $subsequence_slice->_mask_features(\$dna, $repeats, $soft_mask,
                                     $not_default_masking_cases);

  return $dna;
}
287 | |
=head2 _get_repeat_features

  Arg [1]     : Bio::EnsEMBL::Slice to fetch features for
  Description : Gets the repeat features for the given slice, calling
                get_all_RepeatFeatures on it once for every logic name
                returned by repeat_mask_logic_names(), and returns the
                results combined in logic name order.
  Returntype  : ArrayRef[Bio::EnsEMBL::RepeatFeature] array of repeats

=cut

sub _get_repeat_features {
  my ($self, $slice) = @_;

  # Flatten the per-logic-name result lists into one list.
  my @repeat_features =
    map { @{ $slice->get_all_RepeatFeatures($_) } }
    @{ $self->repeat_mask_logic_names() };

  return \@repeat_features;
}
307 | |
308 1; |