Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/IdMapping/InternalIdMapper/EnsemblTranscriptGeneric.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 =head1 LICENSE | |
| 2 | |
| 3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
| 4 Genome Research Limited. All rights reserved. | |
| 5 | |
| 6 This software is distributed under a modified Apache license. | |
| 7 For license details, please see | |
| 8 | |
| 9 http://www.ensembl.org/info/about/code_licence.html | |
| 10 | |
| 11 =head1 CONTACT | |
| 12 | |
| 13 Please email comments or questions to the public Ensembl | |
| 14 developers list at <dev@ensembl.org>. | |
| 15 | |
| 16 Questions may also be sent to the Ensembl help desk at | |
| 17 <helpdesk@ensembl.org>. | |
| 18 | |
| 19 =cut | |
| 20 | |
| 21 =head1 NAME | |
| 22 | |
| 23 =head1 SYNOPSIS | |
| 24 | |
| 25 =head1 DESCRIPTION | |
| 26 | |
| 27 =head1 METHODS | |
| 28 | |
| 29 =cut | |
| 30 | |
| 31 | |
| 32 package Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric; | |
| 33 | |
| 34 use strict; | |
| 35 use warnings; | |
| 36 no warnings 'uninitialized'; | |
| 37 | |
| 38 use Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper; | |
| 39 our @ISA = qw(Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper); | |
| 40 | |
| 41 use Bio::EnsEMBL::Utils::Exception qw(throw warning); | |
| 42 use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append); | |
| 43 | |
| 44 | |
#
# Basic transcript mapping step.
#
# Arguments: $num               - step counter used to name the cached files
#            $tsb               - score builder; must provide
#                                 create_shrinked_matrix()
#            $mappings          - accepted for signature parity with the other
#                                 steps; its incoming value is not read
#            $transcript_scores - score matrix to map from
# Returns  : list of (shrunk score matrix, mappings found in this step)
#
sub init_basic {
  my ($self, $num, $tsb, $mappings, $transcript_scores) = @_;

  $self->logger->info("Basic transcript mapping...\n", 0, 'stamped');

  # mappings are cached under the current step number ...
  my $mapping_name = "transcript_mappings$num";
  $mappings = $self->basic_mapping($transcript_scores, $mapping_name);

  # ... while the reduced matrix is cached under the next one
  $num++;
  my $matrix_name = "transcript_matrix$num";
  my $new_scores =
    $tsb->create_shrinked_matrix($transcript_scores, $mappings, $matrix_name);

  return ($new_scores, $mappings);
}
| 65 | |
| 66 | |
#
# Handle cases with an exact transcript match but a different translation.
#
# Unless the score matrix reports itself as loaded, it is first rescored via
# $tsb->different_translation_rescore() and written to file.
#
# Arguments: $num               - step counter used to name the cached files
#            $tsb               - score builder
#            $mappings          - accepted for signature parity; its incoming
#                                 value is not read
#            $transcript_scores - score matrix to rescore and map from
# Returns  : list of (shrunk score matrix, mappings found in this step)
#
sub non_exact_translation {
  my ($self, $num, $tsb, $mappings, $transcript_scores) = @_;

  $self->logger->info("Exact Transcript non-exact Translation...\n", 0, 'stamped');

  if (!$transcript_scores->loaded) {
    $tsb->different_translation_rescore($transcript_scores);
    $transcript_scores->write_to_file;
  }

  my $mapping_name = "transcript_mappings$num";
  $mappings = $self->basic_mapping($transcript_scores, $mapping_name);

  # the reduced matrix is cached under the next step number
  $num++;
  my $matrix_name = "transcript_matrix$num";
  my $new_scores =
    $tsb->create_shrinked_matrix($transcript_scores, $mappings, $matrix_name);

  return ($new_scores, $mappings);
}
| 92 | |
| 93 | |
#
# Reduce the score for mappings of transcripts which do not belong to mapped
# genes, then map again.
#
# Unless the score matrix reports itself as loaded, it is first rescored via
# $tsb->non_mapped_gene_rescore() using the gene mappings and written to file.
#
# Arguments: $num               - step counter used to name the cached files
#            $tsb               - score builder
#            $mappings          - accepted for signature parity; its incoming
#                                 value is not read
#            $transcript_scores - score matrix to rescore and map from
#            $gene_mappings     - gene mappings handed to the rescoring call
# Returns  : list of (shrunk score matrix, mappings found in this step)
#
sub mapped_gene {
  my ($self, $num, $tsb, $mappings, $transcript_scores, $gene_mappings) = @_;

  $self->logger->info("Transcripts in mapped genes...\n", 0, 'stamped');

  if (!$transcript_scores->loaded) {
    $tsb->non_mapped_gene_rescore($transcript_scores, $gene_mappings);
    $transcript_scores->write_to_file;
  }

  my $mapping_name = "transcript_mappings$num";
  $mappings = $self->basic_mapping($transcript_scores, $mapping_name);

  # the reduced matrix is cached under the next step number
  $num++;
  my $matrix_name = "transcript_matrix$num";
  my $new_scores =
    $tsb->create_shrinked_matrix($transcript_scores, $mappings, $matrix_name);

  return ($new_scores, $mappings);
}
| 121 | |
#
# Rescore by penalising scores between transcripts with different biotypes,
# then map again.
#
# Unless the score matrix reports itself as loaded, it is first rescored via
# $tsb->biotype_transcript_rescore() and written to file.
#
# Arguments: $num               - step counter used to name the cached files
#            $tsb               - score builder
#            $mappings          - accepted for signature parity; never read
#            $transcript_scores - score matrix to rescore and map from
# Returns  : list of (shrunk score matrix, mappings found in this step)
#
sub biotype {
  my ($self, $num, $tsb, $mappings, $transcript_scores) = @_;

  $self->logger->info("Retry with biotype disambiguation...\n", 0, 'stamped');

  if (!$transcript_scores->loaded()) {
    $tsb->biotype_transcript_rescore($transcript_scores);
    $transcript_scores->write_to_file();
  }

  my $mapping_name = "transcript_mappings$num";
  my $found = $self->basic_mapping($transcript_scores, $mapping_name);

  # the reduced matrix is cached under the next step number
  $num++;
  my $matrix_name = "transcript_matrix$num";
  my $reduced =
    $tsb->create_shrinked_matrix($transcript_scores, $found, $matrix_name);

  return ($reduced, $found);
}
| 149 | |
#
# Selectively rescore by penalising scores between transcripts with different
# internalIDs, then map again.
#
# Unless the score matrix reports itself as loaded, it is first rescored via
# $tsb->internal_id_rescore() and written to file.
#
# Arguments: $num               - step counter used to name the cached files
#            $tsb               - score builder
#            $mappings          - accepted for signature parity; its incoming
#                                 value is not read
#            $transcript_scores - score matrix to rescore and map from
# Returns  : list of (shrunk score matrix, mappings found in this step)
#
sub internal_id {
  my ($self, $num, $tsb, $mappings, $transcript_scores) = @_;

  $self->logger->info("Retry with internalID disambiguation...\n", 0, 'stamped');

  if (!$transcript_scores->loaded) {
    $tsb->internal_id_rescore($transcript_scores);
    $transcript_scores->write_to_file;
  }

  my $mapping_name = "transcript_mappings$num";
  $mappings = $self->basic_mapping($transcript_scores, $mapping_name);

  # the reduced matrix is cached under the next step number
  $num++;
  my $matrix_name = "transcript_matrix$num";
  my $new_scores =
    $tsb->create_shrinked_matrix($transcript_scores, $mappings, $matrix_name);

  return ($new_scores, $mappings);
}
| 176 | |
| 177 | |
#
# Handle ambiguities between transcripts in single genes.
#
# No rescoring happens in this step: the matrix is only written to file
# (unless it reports itself as loaded) and then mapped with
# same_gene_transcript_mapping() instead of the basic mapper.
#
# Arguments: $num               - step counter used to name the cached files
#            $tsb               - score builder; must provide
#                                 create_shrinked_matrix()
#            $mappings          - accepted for signature parity; its incoming
#                                 value is not read
#            $transcript_scores - score matrix to map from
# Returns  : list of (shrunk score matrix, mappings found in this step)
#
sub single_gene {
  my ($self, $num, $tsb, $mappings, $transcript_scores) = @_;

  $self->logger->info("Transcripts in single genes...\n", 0, 'stamped');

  $transcript_scores->write_to_file unless $transcript_scores->loaded;

  my $mapping_name = "transcript_mappings$num";
  $mappings =
    $self->same_gene_transcript_mapping($transcript_scores, $mapping_name);

  # the reduced matrix is cached under the next step number
  $num++;
  my $matrix_name = "transcript_matrix$num";
  my $new_scores =
    $tsb->create_shrinked_matrix($transcript_scores, $mappings, $matrix_name);

  return ($new_scores, $mappings);
}
| 202 | |
| 203 | |
#
# Modified basic mapper that maps transcripts that are ambiguous within one
# gene.
#
# Entries of the score matrix are visited by descending score (ties broken by
# ascending source, then ascending target). An entry is accepted when
#   - ambiguous_mapping() reports it as unambiguous, or
#   - it is ambiguous, but the entry and all its remaining competitors belong
#     to exactly one source gene and exactly one target gene.
# Either way the entry's source and target are marked as done afterwards, so
# lower-scoring entries for them are skipped.
#
# The result is checkpointed: if a serialised MappingList named
# "<mapping_name>.ser" was loaded, it is returned as is; otherwise the new
# list is written to file before returning.
#
# Arguments: $matrix       - Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
#            $mapping_name - name used for serialising the mapping
# Returns  : Bio::EnsEMBL::IdMapping::MappingList
# Throws   : if $matrix is not a ScoredMappingMatrix or $mapping_name is false
#
sub same_gene_transcript_mapping {
  my ($self, $matrix, $mapping_name) = @_;

  # argument checks
  #
  # The isa() call is wrapped in eval so that an unblessed reference ends up
  # in our own throw() instead of dying with Perl's "Can't call method" error.
  # $@ is localised to leave the caller's error state untouched.
  my $is_matrix = do {
    local $@;
    eval { $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') };
  };

  unless ($matrix and $is_matrix) {
    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
  }

  throw('Need a name for serialising the mapping.') unless ($mapping_name);

  # Create a new MappingList object. Specify AUTO_LOAD to load serialised
  # existing mappings if found
  my $dump_path = path_append($self->conf->param('basedir'), 'mapping');

  my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
    -DUMP_PATH  => $dump_path,
    -CACHE_FILE => "${mapping_name}.ser",
    -AUTO_LOAD  => 1,
  );

  # checkpoint test: return a previously stored MappingList
  if ($mappings->loaded) {
    $self->logger->info("Read existing mappings from ${mapping_name}.ser.\n");
    return $mappings;
  }

  my $sources_done = {};
  my $targets_done = {};

  # sort scoring matrix entries by descending score
  my @sorted_entries =
    sort {
         $b->score  <=> $a->score
      || $a->source <=> $b->source
      || $a->target <=> $b->target
    } @{ $matrix->get_all_Entries };

  foreach my $entry (@sorted_entries) {

    # we already found a mapping for either source or target
    next if ($sources_done->{ $entry->source } or
             $targets_done->{ $entry->target });

    # handed to ambiguous_mapping(), which presumably fills them with the
    # competing sources and targets -- confirm against BaseMapper
    my $other_sources = [];
    my $other_targets = [];

    if ($self->ambiguous_mapping($entry, $matrix, $other_sources, $other_targets)) {

      # drop competitors that have already been dealt with
      $other_sources = $self->filter_sources($other_sources, $sources_done);
      $other_targets = $self->filter_targets($other_targets, $targets_done);

      # collect the genes of this entry and of all remaining competitors
      my %source_genes = ();
      my %target_genes = ();

      $source_genes{ $self->cache->get_by_key('genes_by_transcript_id',
                                              'source', $entry->source) } = 1;
      $target_genes{ $self->cache->get_by_key('genes_by_transcript_id',
                                              'target', $entry->target) } = 1;

      foreach my $other_source (@{ $other_sources }) {
        $source_genes{ $self->cache->get_by_key('genes_by_transcript_id',
                                                'source', $other_source) } = 1;
      }

      foreach my $other_target (@{ $other_targets }) {
        $target_genes{ $self->cache->get_by_key('genes_by_transcript_id',
                                                'target', $other_target) } = 1;
      }

      # only add mapping if only one source and target gene involved
      if (scalar(keys %source_genes) == 1 and scalar(keys %target_genes) == 1) {
        $mappings->add_Entry($entry);
      }

    } else {

      # this is the best mapping, add it
      $mappings->add_Entry($entry);
    }

    # mark as done even when the entry was rejected as ambiguous
    $sources_done->{ $entry->source } = 1;
    $targets_done->{ $entry->target } = 1;
  }

  # create checkpoint
  $mappings->write_to_file;

  return $mappings;
}
| 302 | |
| 303 | |
| 304 1; | |
| 305 |
