Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Utils/AssemblyProjector.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 =head1 LICENSE | |
| 2 | |
| 3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
| 4 Genome Research Limited. All rights reserved. | |
| 5 | |
| 6 This software is distributed under a modified Apache license. | |
| 7 For license details, please see | |
| 8 | |
| 9 http://www.ensembl.org/info/about/code_licence.html | |
| 10 | |
| 11 =head1 CONTACT | |
| 12 | |
| 13 Please email comments or questions to the public Ensembl | |
| 14 developers list at <dev@ensembl.org>. | |
| 15 | |
| 16 Questions may also be sent to the Ensembl help desk at | |
| 17 <helpdesk@ensembl.org>. | |
| 18 | |
| 19 =cut | |
| 20 | |
| 21 =head1 NAME | |
| 22 | |
| 23 Bio::EnsEMBL::Utils::AssemblyProjector - | |
| 24 utility class to post-process projections from one assembly to another | |
| 25 | |
| 26 =head1 SYNOPSIS | |
| 27 | |
| 28 # connect to an old database | |
| 29 my $dba_old = new Bio::EnsEMBL::DBSQL::DBAdaptor( | |
| 30 -host => 'ensembldb.ensembl.org', | |
| 31 -port => 3306, | |
| 32 -user => ensro, | |
| 33 -dbname => 'mus_musculus_core_46_36g', | |
| 34 -group => 'core_old', | |
| 35 ); | |
| 36 | |
| 37 # connect to the new database containing the mapping between old and | |
| 38 # new assembly | |
| 39 my $dba_new = new Bio::EnsEMBL::DBSQL::DBAdaptor( | |
| 40 -host => 'ensembldb.ensembl.org', | |
| 41 -port => 3306, | |
| 42 -user => ensro, | |
| 43 -dbname => 'mus_musculus_core_47_37', | |
| 44 -group => 'core_new', | |
| 45 ); | |
| 46 | |
| 47 my $assembly_projector = Bio::EnsEMBL::Utils::AssemblyProjector->new( | |
| 48 -OLD_ASSEMBLY => 'NCBIM36', | |
| 49 -NEW_ASSEMBLY => 'NCBIM37', | |
| 50 -ADAPTOR => $dba_new, | |
| 51 -EXTERNAL_SOURCE => 1, | |
| 52 -MERGE_FRAGMENTS => 1, | |
| 53 -CHECK_LENGTH => 0, | |
| 54 ); | |
| 55 | |
| 56 # fetch a slice on the old assembly | |
| 57 my $slice_adaptor = $dba_old->get_SliceAdaptor; | |
| 58 my $slice = | |
| 59 $slice_adaptor->fetch_by_region( 'chromosome', 1, undef, undef, | |
| 60 undef, 'NCBIM36' ); | |
| 61 | |
| 62 my $new_slice = $assembly_projector->old_to_new($slice); | |
| 63 | |
| 64 print $new_slice->name, " (", $assembly_projector->last_status, ")\n"; | |
| 65 | |
| 66 =head1 DESCRIPTION | |
| 67 | |
| 68 This class implements some utility functions for converting coordinates | |
| 69 between assemblies. A mapping between the two assemblies has to present | |
| 70 the database for this to work, see the 'Related Modules' section below | |
| 71 on how to generate the mapping. | |
| 72 | |
| 73 In addition to the "raw" projecting of features and slices, the methods | |
| 74 in this module also apply some sensible rules to the results of the | |
| 75 projection (like discarding unwanted results or merging fragmented | |
| 76 projections). These are the rules (depending on configuration): | |
| 77 | |
| 78 Discard the projected feature/slice if: | |
| 79 | |
| 80 1. it doesn't project at all (no segments returned) | |
| 81 2. [unless MERGE_FRAGMENTS is set] the projection is fragmented (more | |
| 82 than one segment) | |
| 83 3. [if CHECK_LENGTH is set] the projection doesn't have the same | |
| 84 length as the original feature/slice | |
| 85 4. all segments are on same chromosome and strand | |
| 86 | |
| 87 If a projection fails any of these rules, undef is returned instead of | |
| 88 a projected feature/slice. You can use the last_status() method to find | |
| 89 out about the results of the rules tests. | |
| 90 | |
| 91 Also note that when projecting features, only a shallow projection is | |
| 92 performed, i.e. other features attached to your features (e.g. the | |
| 93 transcripts of a gene) are not projected automatically, so it will be | |
| 94 the responsability of the user code project all levels of features | |
| 95 involved. | |
| 96 | |
| 97 =head1 METHODS | |
| 98 | |
| 99 new | |
| 100 project | |
| 101 old_to_new | |
| 102 new_to_old | |
| 103 adaptor | |
| 104 external_source | |
| 105 old_assembly | |
| 106 new_assembly | |
| 107 merge_fragments | |
| 108 check_length | |
| 109 | |
| 110 =head1 RELATED MODULES | |
| 111 | |
| 112 The process of creating a whole genome alignment between two assemblies | |
| 113 (which is the basis for the use of the methods in this class) is done by | |
| 114 a series of scripts. Please see | |
| 115 | |
| 116 ensembl/misc-scripts/assembly/README | |
| 117 | |
| 118 for a high-level description of this process, and POD in the individual | |
| 119 scripts for the details. | |
| 120 | |
| 121 =cut | |
| 122 | |
| 123 package Bio::EnsEMBL::Utils::AssemblyProjector; | |
| 124 | |
| 125 use strict; | |
| 126 use warnings; | |
| 127 no warnings qw(uninitialized); | |
| 128 | |
| 129 use Bio::EnsEMBL::Utils::Exception qw(throw warning); | |
| 130 use Bio::EnsEMBL::Utils::Argument qw(rearrange); | |
| 131 use Bio::EnsEMBL::Slice; | |
| 132 use Scalar::Util qw(weaken); | |
| 133 | |
| 134 =head2 new | |
| 135 | |
| 136 Arg [ADAPTOR] : Bio::EnsEMBL::DBSQL::DBAdaptor $adaptor - a db adaptor | |
| 137 for a database containing the assembly mapping | |
| 138 Arg [EXTERNAL_SOURCE] : (optional) Boolean $external_source - indicates if | |
| 139 source is from a different database | |
| 140 Arg [OLD_ASSEMBLY] : name of the old assembly | |
| 141 Arg [OLD_ASSEMBLY] : name of the new assembly | |
| 142 Arg [OBJECT_TYPE] : (optional) object type ('slice' or 'feature') | |
| 143 Arg [MERGE_FRAGMENTS] : (optional) Boolean - determines if segments are merged | |
| 144 to return a single object spanning all segments | |
| 145 (default: true) | |
| 146 Arg [CHECK_LENGTH] : (optional) Boolean - determines if projected objects | |
| 147 have to have same length as original (default: false) | |
| 148 Example : my $ap = Bio::EnsEMBL::Utils::AssemblyProjector->new( | |
| 149 -DBADAPTOR => $dba, | |
| 150 -OLD_ASSEMBLY => NCBIM36, | |
| 151 -NEW_ASSEMBLY => NCBIM37, | |
| 152 ); | |
| 153 Description : Constructor. | |
| 154 Return type : a Bio::EnsEMBL::Utils::AssemblyProjector object | |
| 155 Exceptions : thrown on missing arguments | |
| 156 thrown on invalid OBJECT_TYPE | |
| 157 Caller : general | |
| 158 Status : At Risk | |
| 159 : under development | |
| 160 | |
| 161 =cut | |
| 162 | |
| 163 sub new { | |
| 164 my $caller = shift; | |
| 165 my $class = ref($caller) || $caller; | |
| 166 | |
| 167 my ($adaptor, $external_source, $old_assembly, $new_assembly, | |
| 168 $merge_fragments, $check_length) = rearrange([qw(ADAPTOR EXTERNAL_SOURCE | |
| 169 OLD_ASSEMBLY NEW_ASSEMBLY MERGE_FRAGMENTS CHECK_LENGTH)], @_); | |
| 170 | |
| 171 unless ($adaptor and ref($adaptor) and | |
| 172 $adaptor->isa('Bio::EnsEMBL::DBSQL::DBAdaptor')) { | |
| 173 throw("You must provide a DBAdaptor to a database containing the assembly mapping."); | |
| 174 } | |
| 175 | |
| 176 unless ($old_assembly and $new_assembly) { | |
| 177 throw("You must provide an old and new assembly name."); | |
| 178 } | |
| 179 | |
| 180 my $self = {}; | |
| 181 bless ($self, $class); | |
| 182 | |
| 183 # initialise | |
| 184 $self->adaptor($adaptor); | |
| 185 $self->{'old_assembly'} = $old_assembly; | |
| 186 $self->{'new_assembly'} = $new_assembly; | |
| 187 | |
| 188 # by default, merge fragments | |
| 189 $self->{'merge_fragments'} = $merge_fragments || 1; | |
| 190 | |
| 191 # by default, do not check length | |
| 192 $self->{'check_length'} = $check_length || 0; | |
| 193 | |
| 194 # by default, features and slices are expected in same database as the | |
| 195 # assembly mapping | |
| 196 $self->{'external_source'} = $external_source || 0; | |
| 197 | |
| 198 return $self; | |
| 199 } | |
| 200 | |
| 201 | |
| 202 =head2 project | |
| 203 | |
| 204 Arg[1] : Bio::EnsEMBL::Slice or Bio::EnsEMBL::Feature $object - | |
| 205 the object to project | |
| 206 Arg[2] : String $to_assembly - assembly to project to | |
| 207 Example : my $new_slice = $assembly_projector->project($old_slice, | |
| 208 'NCBIM37'); | |
| 209 Description : Projects a Slice or Feature to the specified assembly. | |
| 210 | |
| 211 Several tests are performed on the result to discard unwanted | |
| 212 results. All projection segments have to be on the same | |
| 213 seq_region and strand. If -MERGE_FRAGMENTS is set, gaps will be | |
| 214 bridged by creating a single object from first_segment_start to | |
| 215 last_segment_end. If -CHECK_LENGTH is set, the projected object | |
| 216 will have to have the same length as the original. You can use | |
| 217 the last_status() method to find out what the result of some of | |
| 218 these rule tests were. Please see the comments in the code for | |
| 219 more details about these rules. | |
| 220 | |
| 221 The return value of this method will always be a single object, | |
| 222 or undef if the projection fails any of the rules. | |
| 223 | |
| 224 Note that when projecting features, only a "shallow" projection | |
| 225 is performed, i.e. attached features aren't projected | |
| 226 automatically! (e.g. if you project a gene, its transcripts will | |
| 227 have to be projected manually before storing the new gene) | |
| 228 Return type : same a Arg 1, or undef if projection fails any of the rules | |
| 229 Exceptions : thrown on invalid arguments | |
| 230 Caller : general, $self->old_to_new, $self->new_to_old | |
| 231 Status : At Risk | |
| 232 : under development | |
| 233 | |
| 234 =cut | |
| 235 | |
| 236 sub project { | |
| 237 my ($self, $object, $to_assembly) = @_; | |
| 238 | |
| 239 throw("Need an assembly version to project to.") unless ($to_assembly); | |
| 240 throw("Need an object to project.") unless ($object and ref($object)); | |
| 241 | |
| 242 my ($slice, $object_type); | |
| 243 | |
| 244 if ($object->isa('Bio::EnsEMBL::Feature')) { | |
| 245 $object_type = 'feature'; | |
| 246 } elsif ($object->isa('Bio::EnsEMBL::Slice')) { | |
| 247 $object_type = 'slice'; | |
| 248 } else { | |
| 249 throw("Need a Feature or Slice to project."); | |
| 250 } | |
| 251 | |
| 252 # if the feature or slice is sourced from another db, we have to "transfer" | |
| 253 # it to the db that contains the assembly mapping. the transfer is very | |
| 254 # shallow but that should do for our purposes | |
| 255 if ($self->external_source) { | |
| 256 my $slice_adaptor = $self->adaptor->get_SliceAdaptor; | |
| 257 | |
| 258 if ($object_type eq 'feature') { | |
| 259 | |
| 260 # createa a new slice from the target db | |
| 261 my $f_slice = $object->slice; | |
| 262 my $target_slice = $slice_adaptor->fetch_by_name($f_slice->name); | |
| 263 | |
| 264 # now change the feature so that it appears it's from the target db | |
| 265 $object->slice($target_slice); | |
| 266 | |
| 267 } else { | |
| 268 | |
| 269 # createa a new slice from the target db | |
| 270 $object = $slice_adaptor->fetch_by_name($object->name); | |
| 271 | |
| 272 } | |
| 273 } | |
| 274 | |
| 275 if ($object_type eq 'feature') { | |
| 276 $slice = $object->feature_Slice; | |
| 277 } else { | |
| 278 $slice = $object; | |
| 279 } | |
| 280 | |
| 281 # warn if trying to project to assembly version the object already is on | |
| 282 if ($slice->coord_system->version eq $to_assembly) { | |
| 283 warning("Assembly version to project to ($to_assembly) is the same as your object's assembly (".$slice->coord_system->version.")."); | |
| 284 } | |
| 285 | |
| 286 # now project the slice | |
| 287 my $cs_name = $slice->coord_system_name; | |
| 288 my @segments = @{ $slice->project($cs_name, $to_assembly) }; | |
| 289 | |
| 290 # we need to reverse the projection segment list if the orignial | |
| 291 if ($slice->strand == -1) { | |
| 292 @segments = reverse(@segments); | |
| 293 } | |
| 294 | |
| 295 # apply rules to projection results | |
| 296 # | |
| 297 # discard the projected feature/slice if | |
| 298 # 1. it doesn't project at all (no segments returned) | |
| 299 # 2. [unless MERGE_FRAGMENTS is set] the projection is fragmented (more | |
| 300 # than one segment) | |
| 301 # 3. [if CHECK_LENGTH is set] the projection doesn't have the same length | |
| 302 # as the original feature/slice | |
| 303 # 4. all segments are on same chromosome and strand | |
| 304 | |
| 305 # keep track of the status of applied rules | |
| 306 my @status = (); | |
| 307 | |
| 308 # test (1) | |
| 309 return undef unless (@segments); | |
| 310 #warn "DEBUG: passed test 1\n"; | |
| 311 | |
| 312 # test (2) | |
| 313 return undef if (!($self->merge_fragments) and scalar(@segments) > 1); | |
| 314 push @status, 'fragmented' if (scalar(@segments) > 1); | |
| 315 #warn "DEBUG: passed test 2\n"; | |
| 316 | |
| 317 # test (3) | |
| 318 my $first_slice = $segments[0]->to_Slice; | |
| 319 my $last_slice = $segments[-1]->to_Slice; | |
| 320 my $length_mismatch = (($last_slice->end - $first_slice->start + 1) != | |
| 321 $object->length); | |
| 322 return undef if ($self->check_length and $length_mismatch); | |
| 323 push @status, 'length_mismatch' if ($length_mismatch); | |
| 324 #warn "DEBUG: passed test 3\n"; | |
| 325 | |
| 326 # test (4) | |
| 327 my %sr_names = (); | |
| 328 my %strands = (); | |
| 329 foreach my $seg (@segments) { | |
| 330 my $sl = $seg->to_Slice; | |
| 331 $sr_names{$sl->seq_region_name}++; | |
| 332 $strands{$sl->strand}++; | |
| 333 } | |
| 334 return undef if (scalar(keys %sr_names) > 1 or scalar(keys %strands) > 1); | |
| 335 #warn "DEBUG: passed test 4\n"; | |
| 336 | |
| 337 # remember rule status | |
| 338 $self->last_status(join('|', @status)); | |
| 339 | |
| 340 # everything looks fine, so adjust the coords of your feature/slice | |
| 341 my $new_slice = $first_slice; | |
| 342 $new_slice->{'end'} = $last_slice->end; | |
| 343 | |
| 344 if ($object_type eq 'slice') { | |
| 345 return $new_slice; | |
| 346 } else { | |
| 347 | |
| 348 $object->start($new_slice->start); | |
| 349 $object->end($new_slice->end); | |
| 350 $object->strand($new_slice->strand); | |
| 351 $object->slice($new_slice->seq_region_Slice); | |
| 352 | |
| 353 # undef dbID and adaptor so you can store the feature in the target db | |
| 354 $object->dbID(undef); | |
| 355 $object->adaptor(undef); | |
| 356 | |
| 357 return $object; | |
| 358 } | |
| 359 | |
| 360 } | |
| 361 | |
| 362 | |
| 363 =head2 old_to_new | |
| 364 | |
| 365 Arg[1] : Bio::EnsEMBL::Slice or Bio::EnsEMBL::Feature $object - | |
| 366 the object to project | |
| 367 Example : my $new_slice = $assembly_projector->old_to_new($old_slice); | |
| 368 Description : Projects a Slice or Feature from old to new assembly. | |
| 369 This method is just a convenience wrapper for $self->project. | |
| 370 Return type : same a Arg 1, or undef | |
| 371 Exceptions : none | |
| 372 Caller : general | |
| 373 Status : At Risk | |
| 374 : under development | |
| 375 | |
| 376 =cut | |
| 377 | |
| 378 sub old_to_new { | |
| 379 my ($self, $object) = @_; | |
| 380 return $self->project($object, $self->new_assembly); | |
| 381 } | |
| 382 | |
| 383 | |
| 384 =head2 new_to_old | |
| 385 | |
| 386 Arg[1] : Bio::EnsEMBL::Slice or Bio::EnsEMBL::Feature $object - | |
| 387 the object to project | |
| 388 Example : my $old_slice = $assembly_projector->new_to_old($new_slice, 1); | |
| 389 Description : Projects a Slice or Feature from new to old assembly. | |
| 390 This method is just a convenience wrapper for $self->project. | |
| 391 Return type : same a Arg 1, or undef | |
| 392 Exceptions : none | |
| 393 Caller : general | |
| 394 Status : At Risk | |
| 395 : under development | |
| 396 | |
| 397 =cut | |
| 398 | |
| 399 sub new_to_old { | |
| 400 my ($self, $object) = @_; | |
| 401 return $self->project($object, $self->old_assembly); | |
| 402 } | |
| 403 | |
| 404 | |
| 405 # | |
| 406 # accessors | |
| 407 # | |
| 408 sub adaptor { | |
| 409 my $self = shift; | |
| 410 weaken($self->{'adaptor'} = shift) if (@_); | |
| 411 return $self->{'adaptor'}; | |
| 412 } | |
| 413 | |
| 414 | |
| 415 sub external_source { | |
| 416 my $self = shift; | |
| 417 $self->{'external_source'} = shift if (@_); | |
| 418 return $self->{'external_source'}; | |
| 419 } | |
| 420 | |
| 421 | |
| 422 sub old_assembly { | |
| 423 my $self = shift; | |
| 424 $self->{'old_assembly'} = shift if (@_); | |
| 425 return $self->{'old_assembly'}; | |
| 426 } | |
| 427 | |
| 428 | |
| 429 sub new_assembly { | |
| 430 my $self = shift; | |
| 431 $self->{'new_assembly'} = shift if (@_); | |
| 432 return $self->{'new_assembly'}; | |
| 433 } | |
| 434 | |
| 435 | |
| 436 sub merge_fragments { | |
| 437 my $self = shift; | |
| 438 $self->{'merge_fragments'} = shift if (@_); | |
| 439 return $self->{'merge_fragments'}; | |
| 440 } | |
| 441 | |
| 442 | |
| 443 sub check_length { | |
| 444 my $self = shift; | |
| 445 $self->{'check_length'} = shift if (@_); | |
| 446 return $self->{'check_length'}; | |
| 447 } | |
| 448 | |
| 449 | |
| 450 sub last_status { | |
| 451 my $self = shift; | |
| 452 $self->{'last_status'} = shift if (@_); | |
| 453 return $self->{'last_status'}; | |
| 454 } | |
| 455 | |
| 456 | |
| 457 1; | |
| 458 | |
| 459 |
