Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/TopLevelAssemblyMapper.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 =head1 LICENSE | |
| 2 | |
| 3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
| 4 Genome Research Limited. All rights reserved. | |
| 5 | |
| 6 This software is distributed under a modified Apache license. | |
| 7 For license details, please see | |
| 8 | |
| 9 http://www.ensembl.org/info/about/code_licence.html | |
| 10 | |
| 11 =head1 CONTACT | |
| 12 | |
| 13 Please email comments or questions to the public Ensembl | |
| 14 developers list at <dev@ensembl.org>. | |
| 15 | |
| 16 Questions may also be sent to the Ensembl help desk at | |
| 17 <helpdesk@ensembl.org>. | |
| 18 | |
| 19 =cut | |
| 20 | |
| 21 =head1 NAME | |
| 22 | |
| 23 Bio::EnsEMBL::TopLevelAssemblyMapper - | |
| 24 Handles mapping between a given coordinate system and the toplevel | |
| 25 pseudo coordinate system. | |
| 26 | |
| 27 =head1 SYNOPSIS | |
| 28 | |
| 29 $db = Bio::EnsEMBL::DBSQL::DBAdaptor->new(...); | |
| 30 $asma = $db->get_AssemblyMapperAdaptor(); | |
| 31 $csa = $db->get_CoordSystemAdaptor(); | |
| 32 | |
| 33 my $toplevel = $cs_adaptor->fetch_by_name('toplevel'); | |
| 34 my $ctg_cs = $cs_adaptor->fetch_by_name('contig'); | |
| 35 | |
| 36 $asm_mapper = $map_adaptor->fetch_by_CoordSystems( $cs1, $cs2 ); | |
| 37 | |
| 38 # map to toplevel coord system for this region | |
| 39 @chr_coords = | |
| 40 $asm_mapper->map( 'AL30421.1.200.92341', 100, 10000, -1, $ctg_cs ); | |
| 41 | |
| 42 # list toplevel seq_region_ids for this region | |
| 43 @chr_ids = | |
| 44 $asm_mapper->list_ids( 'AL30421.1.200.92341', 1, 1000, -1, | |
| 45 $ctg_cs ); | |
| 46 | |
| 47 =head1 DESCRIPTION | |
| 48 | |
| 49 The TopLevelAssemblyMapper performs mapping between a provided | |
| 50 coordinate system and the toplevel pseudo cooordinate system. The | |
| 51 toplevel coordinate system is not a real coordinate system, but | |
| 52 represents the highest coordinate system that can be mapped to in a | |
| 53 given region. It is only possible to perform unidirectional mapping | |
| 54 using this mapper, because it does not make sense to map from the | |
| 55 toplevel coordinate system to another coordinate system. | |
| 56 | |
| 57 =head1 METHODS | |
| 58 | |
| 59 =cut | |
| 60 | |
| 61 | |
| 62 use strict; | |
| 63 use warnings; | |
| 64 | |
| 65 package Bio::EnsEMBL::TopLevelAssemblyMapper; | |
| 66 | |
| 67 use Bio::EnsEMBL::Utils::Exception qw(throw); | |
| 68 use Bio::EnsEMBL::Mapper; | |
| 69 use Bio::EnsEMBL::CoordSystem; | |
| 70 use Scalar::Util qw(weaken); | |
| 71 | |
| 72 =head2 new | |
| 73 | |
| 74 Arg [1] : Bio::EnsEMBL::DBAdaptor $dbadaptor the adaptor for | |
| 75 the database this mapper is using. | |
| 76 Arg [2] : Toplevel CoordSystem | |
| 77 Arg [3] : Other CoordSystem | |
| 78 Description: Creates a new TopLevelAssemblyMapper object | |
| 79 Returntype : Bio::EnsEMBL::DBSQL::TopLevelAssemblyMapper | |
| 80 Exceptions : throws if any of the 3 arguments are missing/ not | |
| 81 : of the correct type. | |
| 82 Caller : Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor | |
| 83 Status : Stable | |
| 84 | |
| 85 =cut | |
| 86 | |
| 87 | |
| 88 sub new { | |
| 89 my ($caller, $adaptor, $toplevel_cs, $other_cs) = @_; | |
| 90 | |
| 91 my $class = ref($caller) || $caller; | |
| 92 | |
| 93 if(!ref($toplevel_cs)) { | |
| 94 throw('Toplevel CoordSystem argument expected.'); | |
| 95 } | |
| 96 if(!ref($other_cs)) { | |
| 97 throw('Other CoordSystem argument expected.'); | |
| 98 } | |
| 99 | |
| 100 if(!$toplevel_cs->is_top_level()) { | |
| 101 throw($toplevel_cs->name() . " is not the toplevel CoordSystem."); | |
| 102 } | |
| 103 if($other_cs->is_top_level()) { | |
| 104 throw("Other CoordSystem argument should not be toplevel CoordSystem."); | |
| 105 } | |
| 106 | |
| 107 my $cs_adaptor = $adaptor->db()->get_CoordSystemAdaptor(); | |
| 108 my $coord_systems = $cs_adaptor->fetch_all(); | |
| 109 | |
| 110 my $self = bless {'coord_systems' => $coord_systems, | |
| 111 'toplevel_cs' => $toplevel_cs, | |
| 112 'other_cs' => $other_cs}, $class; | |
| 113 | |
| 114 $self->adaptor($adaptor); | |
| 115 return $self; | |
| 116 } | |
| 117 | |
| 118 sub adaptor { | |
| 119 my $self = shift; | |
| 120 | |
| 121 weaken($self->{'adaptor'} = shift) if(@_); | |
| 122 | |
| 123 return $self->{'adaptor'}; | |
| 124 } | |
| 125 | |
| 126 =head2 map | |
| 127 | |
| 128 Arg [1] : string $frm_seq_region | |
| 129 The name of the sequence region to transform FROM | |
| 130 Arg [2] : int $frm_start | |
| 131 The start of the region to transform FROM | |
| 132 Arg [3] : int $frm_end | |
| 133 The end of the region to transform FROM | |
| 134 Arg [4] : int $strand | |
| 135 The strand of the region to transform FROM | |
| 136 Arg [5] : Bio::EnsEMBL::CoordSystem | |
| 137 The coordinate system to transform FROM | |
| 138 Arg [6] : if set will do a fastmap | |
| 139 Example : @coords = $mapper->map('X', 1_000_000, 2_000_000, | |
| 140 1, $chr_cs); | |
| 141 Description: Transforms coordinates from one coordinate system | |
| 142 to another. | |
| 143 Returntype : List of Bio::EnsEMBL::Mapper::Coordinate and/or | |
| 144 Bio::EnsEMBL::Mapper:Gap objects | |
| 145 Exceptions : thrown if if the specified TO coordinate system is not one | |
| 146 of the coordinate systems associated with this mapper | |
| 147 Caller : general | |
| 148 Status : Stable | |
| 149 | |
| 150 =cut | |
| 151 | |
| 152 | |
| 153 sub map { | |
| 154 throw('Incorrect number of arguments.') if(@_ != 6 && @_ != 7); | |
| 155 | |
| 156 my($self, $frm_seq_region_name, $frm_start, $frm_end, $frm_strand, $frm_cs, | |
| 157 $fastmap) = @_; | |
| 158 | |
| 159 if($frm_cs->is_top_level()) { | |
| 160 throw("The toplevel CoordSystem can only be mapped TO, not FROM."); | |
| 161 } | |
| 162 | |
| 163 my @tmp; | |
| 164 push @tmp, $frm_seq_region_name; | |
| 165 my $seq_region_id = @{$self->adaptor()->seq_regions_to_ids($frm_cs, \@tmp)}[0]; | |
| 166 | |
| 167 my $mapper = $self->{'mapper'}; | |
| 168 my $toplevel_cs = $self->{'toplevel_cs'}; | |
| 169 my $other_cs = $self->{'other_cs'}; | |
| 170 my $adaptor = $self->adaptor; | |
| 171 | |
| 172 if($frm_cs != $other_cs && !$frm_cs->equals($other_cs)) { | |
| 173 throw("Coordinate system " . $frm_cs->name . " " . $frm_cs->version . | |
| 174 " is neither the assembled nor the component coordinate system " . | |
| 175 " of this AssemblyMapper"); | |
| 176 } | |
| 177 | |
| 178 my $coord_systems = $self->{'coord_systems'}; | |
| 179 | |
| 180 my $csa = $self->adaptor()->db()->get_CoordSystemAdaptor(); | |
| 181 | |
| 182 # | |
| 183 # TBD try to make this more efficient | |
| 184 # | |
| 185 my $from_rank = $other_cs->rank(); | |
| 186 foreach my $cs (@$coord_systems) { | |
| 187 last if($cs->rank >= $from_rank); | |
| 188 | |
| 189 #check if a mapping path even exists to this coordinate system | |
| 190 my @mapping_path = @{ $csa->get_mapping_path( $cs, $other_cs ) }; | |
| 191 | |
| 192 if(@mapping_path) { | |
| 193 | |
| 194 # Try to map to this coord system. If we get back any coordinates then | |
| 195 # it is our 'toplevel' that we were looking for | |
| 196 my $mapper = $adaptor->fetch_by_CoordSystems($other_cs, $cs); | |
| 197 | |
| 198 if($fastmap) { | |
| 199 my @result = $mapper->fastmap($frm_seq_region_name, $frm_start, $frm_end, | |
| 200 $frm_strand, $frm_cs); | |
| 201 return @result if(@result); | |
| 202 } else { | |
| 203 my @coords = $mapper->map($frm_seq_region_name, $frm_start, $frm_end, | |
| 204 $frm_strand, $frm_cs); | |
| 205 | |
| 206 if(@coords > 1 || !$coords[0]->isa('Bio::EnsEMBL::Mapper::Gap')) { | |
| 207 return @coords; | |
| 208 } | |
| 209 } | |
| 210 } | |
| 211 } | |
| 212 | |
| 213 # the toplevel coordinate system for the region requested *is* the | |
| 214 # requested region. | |
| 215 if($fastmap) { | |
| 216 return ($seq_region_id,$frm_start, $frm_end, $frm_strand, $other_cs); | |
| 217 } | |
| 218 return Bio::EnsEMBL::Mapper::Coordinate->new | |
| 219 ($seq_region_id,$frm_start,$frm_end, $frm_strand, $other_cs); | |
| 220 } | |
| 221 | |
| 222 # | |
| 223 # for polymorphism with AssemblyMapper | |
| 224 # | |
| 225 =head2 flush | |
| 226 | |
| 227 Args : none | |
| 228 Example : none | |
| 229 Description: polymorphism with AssemblyMapper, does nothing | |
| 230 Returntype : none | |
| 231 Exceptions : none | |
| 232 Status : Stable | |
| 233 | |
| 234 =cut | |
| 235 | |
| 236 sub flush {} | |
| 237 | |
| 238 =head2 fastmap | |
| 239 | |
| 240 Arg [1] : string $frm_seq_region | |
| 241 The name of the sequence region to transform FROM | |
| 242 Arg [2] : int $frm_start | |
| 243 The start of the region to transform FROM | |
| 244 Arg [3] : int $frm_end | |
| 245 The end of the region to transform FROM | |
| 246 Arg [4] : int $strand | |
| 247 The strand of the region to transform FROM | |
| 248 Arg [5] : Bio::EnsEMBL::CoordSystem | |
| 249 The coordinate system to transform FROM | |
| 250 Example : @coords = $mapper->fastmap('X', 1_000_000, 2_000_000, | |
| 251 1, $chr_cs); | |
| 252 Description: Transforms coordinates from one coordinate system | |
| 253 to another. | |
| 254 Returntype : List of Bio::EnsEMBL::Mapper::Coordinate and/or | |
| 255 Bio::EnsEMBL::Mapper:Gap objects | |
| 256 Exceptions : thrown if if the specified TO coordinate system is not one | |
| 257 of the coordinate systems associated with this mapper | |
| 258 Caller : general | |
| 259 Status : Stable | |
| 260 | |
| 261 =cut | |
| 262 | |
| 263 sub fastmap { | |
| 264 my $self = shift; | |
| 265 return $self->map(@_,1); | |
| 266 } | |
| 267 | |
| 268 =head2 assembled_CoordSystem | |
| 269 | |
| 270 Arg [1] : none | |
| 271 Example : $cs = $mapper->assembled_CoordSystem | |
| 272 Description: Retrieves the assembled CoordSystem from this mapper | |
| 273 Returntype : Bio::EnsEMBL::CoordSystem | |
| 274 Exceptions : none | |
| 275 Caller : internal, AssemblyMapperAdaptor | |
| 276 Status : Stable | |
| 277 | |
| 278 =cut | |
| 279 | |
| 280 sub assembled_CoordSystem { | |
| 281 my $self = shift; | |
| 282 return $self->{'toplevel_cs'}; | |
| 283 } | |
| 284 | |
| 285 =head2 component_CoordSystem | |
| 286 | |
| 287 Arg [1] : none | |
| 288 Example : $cs = $mapper->component_CoordSystem | |
| 289 Description: Retrieves the component CoordSystem from this mapper | |
| 290 Returntype : Bio::EnsEMBL::CoordSystem | |
| 291 Exceptions : none | |
| 292 Caller : internal, AssemblyMapperAdaptor | |
| 293 Status : Stable | |
| 294 | |
| 295 =cut | |
| 296 | |
| 297 sub component_CoordSystem { | |
| 298 my $self = shift; | |
| 299 return $self->{'other_cs'}; | |
| 300 } | |
| 301 | |
| 302 | |
| 303 sub _list { | |
| 304 my($self, $frm_seq_region_name, $frm_start, $frm_end, $frm_cs, $seq_regions) = @_; | |
| 305 | |
| 306 my $mapper = $self->{'mapper'}; | |
| 307 my $toplevel_cs = $self->{'toplevel_cs'}; | |
| 308 my $other_cs = $self->{'other_cs'}; | |
| 309 my $adaptor = $self->adaptor; | |
| 310 | |
| 311 if($frm_cs->is_top_level()) { | |
| 312 throw("The toplevel CoordSystem can only be mapped TO, not FROM."); | |
| 313 } | |
| 314 if($frm_cs != $other_cs && !$frm_cs->equals($other_cs)) { | |
| 315 throw("Coordinate system " . $frm_cs->name . " " . $frm_cs->version . | |
| 316 " is neither the assembled nor the component coordinate system " . | |
| 317 " of this AssemblyMapper"); | |
| 318 } | |
| 319 | |
| 320 my $coord_systems = $self->{'coord_systems'}; | |
| 321 my $csa = $self->adaptor()->db()->get_CoordSystemAdaptor(); | |
| 322 | |
| 323 # | |
| 324 # TBD try to make this more efficient | |
| 325 # | |
| 326 my $from_rank = $other_cs->rank(); | |
| 327 foreach my $cs (@$coord_systems) { | |
| 328 last if($cs->rank >= $from_rank); | |
| 329 | |
| 330 #check if a mapping path even exists to this coordinate system | |
| 331 my @mapping_path = @{ $csa->get_mapping_path( $cs, $other_cs ) }; | |
| 332 | |
| 333 if(@mapping_path) { | |
| 334 | |
| 335 # Try to map to this coord system. If we get back any coordinates then | |
| 336 # it is our 'toplevel' that we were looking for | |
| 337 my $mapper = $adaptor->fetch_by_CoordSystems($other_cs, $cs); | |
| 338 | |
| 339 my @result; | |
| 340 | |
| 341 my @tmp; | |
| 342 push @tmp, $frm_seq_region_name; | |
| 343 my $seq_region_id = @{$self->adaptor()->seq_regions_to_ids($frm_cs, \@tmp)}[0]; | |
| 344 | |
| 345 if($seq_regions) { | |
| 346 @result = $mapper->list_seq_regions($frm_seq_region_name, $frm_start, | |
| 347 $frm_end, $frm_cs); | |
| 348 } else { | |
| 349 @result = $mapper->list_ids($frm_seq_region_name, $frm_start, | |
| 350 $frm_end, $frm_cs); | |
| 351 } | |
| 352 | |
| 353 return @result if(@result); | |
| 354 } | |
| 355 } | |
| 356 | |
| 357 # the toplevel coordinate system for the region requested *is* the | |
| 358 return ($frm_seq_region_name); | |
| 359 | |
| 360 | |
| 361 # requested region. | |
| 362 if($seq_regions) { | |
| 363 return ($frm_seq_region_name); | |
| 364 } | |
| 365 | |
| 366 #this seems a bit silly and inefficient, but it is probably never | |
| 367 #called anyway. | |
| 368 my $slice_adaptor = $adaptor->db()->get_SliceAdaptor(); | |
| 369 my $slice = $slice_adaptor->fetch_by_region($other_cs->name(), | |
| 370 $frm_seq_region_name, | |
| 371 undef,undef,undef,$other_cs); | |
| 372 return ($slice_adaptor->get_seq_region_id($slice)); | |
| 373 } | |
| 374 | |
| 375 | |
| 376 | |
| 377 =head2 list_seq_regions | |
| 378 | |
| 379 Arg [1] : string $frm_seq_region | |
| 380 The name of the sequence region of interest | |
| 381 Arg [2] : int $frm_start | |
| 382 The start of the region of interest | |
| 383 Arg [3] : int $frm_end | |
| 384 The end of the region to transform of interest | |
| 385 Arg [5] : Bio::EnsEMBL::CoordSystem $frm_cs | |
| 386 The coordinate system to obtain overlapping ids of | |
| 387 Example : foreach $id ($asm_mapper->list_ids('X',1,1000,$ctg_cs)) {...} | |
| 388 Description: Retrieves a list of overlapping seq_region names | |
| 389 of another coordinate system. This is the same as the | |
| 390 list_ids method but uses seq_region names rather internal ids | |
| 391 Returntype : List of strings | |
| 392 Exceptions : none | |
| 393 Caller : general | |
| 394 Status : Stable | |
| 395 | |
| 396 =cut | |
| 397 | |
| 398 sub list_seq_regions { | |
| 399 throw('Incorrect number of arguments.') if(@_ != 5); | |
| 400 return _list(@_,1); | |
| 401 } | |
| 402 | |
| 403 | |
| 404 =head2 list_ids | |
| 405 | |
| 406 Arg [1] : string $frm_seq_region | |
| 407 The name of the sequence region of interest. | |
| 408 Arg [2] : int $frm_start | |
| 409 The start of the region of interest | |
| 410 Arg [3] : int $frm_end | |
| 411 The end of the region to transform of interest | |
| 412 Arg [5] : Bio::EnsEMBL::CoordSystem $frm_cs | |
| 413 The coordinate system to obtain overlapping ids of | |
| 414 Example : foreach $id ($asm_mapper->list_ids('X',1,1000,$chr_cs)) {...} | |
| 415 Description: Retrieves a list of overlapping seq_region internal identifiers | |
| 416 of another coordinate system. This is the same as the | |
| 417 list_seq_regions method but uses internal identfiers rather | |
| 418 than seq_region strings | |
| 419 Returntype : List of ints | |
| 420 Exceptions : thrown if the from CoordSystem is the toplevel coord system | |
| 421 thrown if the from CoordSystem is not the one used in the mapper | |
| 422 Caller : general | |
| 423 Status : Stable | |
| 424 | |
| 425 =cut | |
| 426 | |
| 427 sub list_ids { | |
| 428 throw('Incorrect number of arguments.') if(@_ != 5); | |
| 429 return _list(@_,0); | |
| 430 } | |
| 431 | |
| 432 | |
| 433 | |
| 434 | |
| 435 | |
| 436 1; |
