Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/IdMapping/ResultAnalyser.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/ResultAnalyser.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,794 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +=head1 NAME + +Bio::EnsEMBL::IdMapping::ResultAnalyser - analyse stable Id mapping results + +=head1 SYNOPSIS + + # get a result analyser + my $analyser = Bio::EnsEMBL::IdMapping::ResultAnalyser->new( + -LOGGER => $logger, + -CONF => $conf, + -CACHE => $cache + ); + + # analyse results + $analyser->analyse( $gene_mappings, + $stable_id_mapper->get_all_stable_id_events('similarity') ); + + # write results to file + $analyser->write_results_to_file; + + # create click lists + $analyser->create_clicklist; + + # mapping_summary + $analyser->create_mapping_summary; + +=head1 DESCRIPTION + +This is a utility module which analyses the stable Id mapping results +by providing various sorts of mapping statistics. It also creates +clicklists and a mapping summary. + +=head1 METHODS + + analyse + analyse_db + classify_source_genes_by_type + classify_genes_by_mapping_simple + classify_genes_by_mapping + add + get + get_all_by_subclass + get_all_by_class + get_count_by_subclass + get_count_by_class + get_all_classes + class_key + write_results_to_file + create_clicklist + create_mapping_summary + read_from_file + +=cut + + +package Bio::EnsEMBL::IdMapping::ResultAnalyser; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::IdMapping::BaseObject; +our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append); + + +=head2 analyse + + Arg[1] : Bio::EnsEMBL::IdMapping::MappingList $gene_mappings - the gene + mappings to analyse + Arg[2] : Arrayref of Strings - similarity events + Example : $analyser->analyse($gene_mappings, + $stable_id_mapper->get_all_stable_id_events('similarity')); + Description : Analyses the results of a stable Id mapping run. + Return type : none + Exceptions : thrown on wrong or missing arguments + Caller : general + Status : At Risk + : under development + +=cut + +sub analyse { + my $self = shift; + my $gene_mappings = shift; + my $similarity_events = shift; + + # argument check + unless ($gene_mappings and + $gene_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) { + throw("Need a Bio::EnsEMBL::IdMapping::MappingList of genes."); + } + + unless ($similarity_events and ref($similarity_events) eq 'ARRAY') { + throw("Need a list of similarity events."); + } + + # classify source genes by type (status-logic_name-biotype) + $self->classify_source_genes_by_type; + + # classify source genes by mapping status + $self->classify_genes_by_mapping($gene_mappings, $similarity_events); +} + + +=head2 classify_source_genes_by_type + + Example : $analyser->classify_source_genes_by_type; + Description : Classifies source genes by type and adds them to the internal + datastructure. For the format of the classification string see + class_key(). + Return type : none + Exceptions : none + Caller : internal + Status : At Risk + : under development + +=cut + +sub classify_source_genes_by_type { + my $self = shift; + + foreach my $s_gene (values %{ $self->cache->get_by_name('genes_by_id', 'source') }) { + $self->add('source', $self->class_key($s_gene), 'all', $s_gene->stable_id); + } +} + + +=head2 classify_genes_by_mapping_simple + + Arg[1] : Bio::EnsEMBL::IdMapping::MapppingList $gene_mappings - gene + mappings to classify + Example : $analyser->classify_genes_by_mapping_simple; + Description : Classifies target genes by mapping ('mapped' or 'unmapped'). + Return type : none + Exceptions : thrown on wrong or missing argument + Caller : This method is not in use at the momen. + Status : At Risk + : under development + +=cut + +sub classify_genes_by_mapping_simple { + my $self = shift; + my $gene_mappings = shift; + + # argument check + unless ($gene_mappings and + $gene_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) { + throw("Need a Bio::EnsEMBL::IdMapping::MappingList of genes."); + } + + my %result = (); + + # firrst, create a lookup hash of source genes by target internal ID + my %source_genes_by_target = (); + foreach my $e (@{ $gene_mappings->get_all_Entries }) { + my $s_gene = $self->cache->get_by_key('genes_by_id', 'source', $e->source); + my $t_gene = $self->cache->get_by_key('genes_by_id', 'target', $e->target); + $source_genes_by_target{$t_gene->id} = $s_gene; + } + + # now loop over target genes + foreach my $t_gene (values %{ $self->cache->get_by_name('genes_by_id', 'target') }) { + + # check if target gene has all required properties set + unless ($t_gene->status and $t_gene->logic_name and $t_gene->biotype) { + $self->logger->warning("Missing data for target gene: ". + $t_gene->to_string."\n", 1); + } + + my $class = $self->class_key($t_gene); + + # classify as '1' if mapped (using source gene's stable ID), otherwise '0' + if (my $s_gene = $source_genes_by_target{$t_gene->id}) { + $self->add('target', $class, 'mapped', $s_gene->stable_id); + } else { + $self->add('target', $class, 'unmapped', $t_gene->stable_id); + } + + } +} + + +=head2 classify_genes_by_mapping + + Arg[1] : Bio::EnsEMBL::IdMapping::MapppingList $gene_mappings - gene + mappings to classify + Arg[2] : Arrayref of Strings - similarity events + Example : $analyser->classify_genes_by_mapping; + Description : Classifies genes by mapping. Status is + 'mapped' => stable Id was mapped + 'lost_similar' => stable Id not mapped, but there is a + similarity entry for the source Id + 'lost_definite' => not mapped and no similarity + Return type : none + Exceptions : thrown on wrong or missing argument + Caller : This method is not in use at the momen. + Status : At Risk + : under development + +=cut + +sub classify_genes_by_mapping { + my $self = shift; + my $gene_mappings = shift; + my $similarity_events = shift; + + # argument check + unless ($gene_mappings and + $gene_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) { + throw("Need a Bio::EnsEMBL::IdMapping::MappingList of genes."); + } + + unless ($similarity_events and ref($similarity_events) eq 'ARRAY') { + throw("Need a list of similarity events."); + } + + # mapped genes + foreach my $e (@{ $gene_mappings->get_all_Entries }) { + my $s_gene = $self->cache->get_by_key('genes_by_id', 'source', $e->source); + $self->add('source', $self->class_key($s_gene), 'mapped', + $s_gene->stable_id); + } + + # lookup hash for similarities + my %similar = (); + foreach my $event (@{ $similarity_events }) { + my ($stable_id) = split("\t", $event); + $similar{$stable_id} = 1; + } + + # deleted genes + foreach my $s_gene (values %{ $self->cache->get_by_name('genes_by_id', 'source') }) { + + my $stable_id = $s_gene->stable_id; + my $class = $self->class_key($s_gene); + + unless ($self->get('source', $class, 'mapped', $stable_id)) { + + # sub-classify as 'lost_similar' or 'lost_definite' + if ($similar{$stable_id}) { + $self->add('source', $class, 'lost_similar', $stable_id); + } else { + $self->add('source', $class, 'lost_definite', $stable_id); + } + + } + } + +} + + +=head2 add + + Arg[1] : String $dbtype - db type ('source' or 'target') + Arg[2] : String $class - key identifying a gene type (see class_key()) + Arg[3] : String $subclass - status identifier (e.g. 'mapped', 'lost') + Arg[4] : String $stable_id - gene stable Id + Arg[5] : String $val - value (usually 0 or 1) + Example : $analyser->add('source', 'KNOWN-ensembl-protein_coding', + 'mapped', 'ENSG00002342', 1); + Description : Add a stable Id / property pair to a name/dbtype lookup hash. + + The datastructure is a bit of a bloat, but is general enough to + be used as a lookup hash and to generate statistics (counts by + type) and debug lists (dump by type). + Return type : String - the added value + Exceptions : none + Caller : internal + Status : At Risk + : under development + +=cut + +sub add { + my ($self, $dbtype, $class, $subclass, $stable_id, $val) = @_; + + # private method, so no argument check done for performance reasons + + # default to a value of '1' + $val = 1 unless (defined($val)); + + $self->{$dbtype}->{$class}->{$subclass}->{$stable_id} = $val; +} + + +=head2 get + + Arg[1] : String $dbtype - db type ('source' or 'target') + Arg[2] : String $class - key identifying a gene type (see class_key()) + Arg[3] : String $subclass - status identifier (e.g. 'mapped', 'lost') + Arg[4] : String $stable_id - gene stable Id + Example : my $mapping_status = $analyser->get('source', + 'KNOWN-ensembl-protein_coding', 'mapped', 'ENSG00002342'); + Description : Gets a stable Id mapping status from the internal datastructure. + Return type : String + Exceptions : none + Caller : internal + Status : At Risk + : under development + +=cut + +sub get { + my ($self, $dbtype, $class, $subclass, $stable_id) = @_; + + # private method, so no argument check done for performance reasons + + return $self->{$dbtype}->{$class}->{$subclass}->{$stable_id}; +} + + +=head2 get_all_by_subclass + + Arg[1] : String $dbtype - db type ('source' or 'target') + Arg[2] : String $class - key identifying a gene type (see class_key()) + Arg[3] : String $subclass - status identifier (e.g. 'mapped', 'lost') + Example : my @mapped_stable_ids = @{ + $analyser->get_all_by_subclass( + 'source', 'KNOWN-ensembl-protein_coding', + 'mapped' + ) }; + Description : Gets a list of stable Id for a given subclass. + Return type : Arrayref of String (stable Ids) + Exceptions : thrown on missing arguments + Caller : internal + Status : At Risk + : under development + +=cut + +sub get_all_by_subclass { + my ($self, $dbtype, $class, $subclass) = @_; + + # argument check + throw("Need a dbtype (source|target).") unless ($dbtype); + throw("Need a class.") unless ($class); + throw("Need a subclass.") unless ($subclass); + + return [ keys %{ $self->{$dbtype}->{$class}->{$subclass} || {} } ]; +} + + +=head2 get_all_by_class + + Arg[1] : String $dbtype - db type ('source' or 'target') + Arg[2] : String $class - key identifying a gene type (see class_key()) + Example : my @stable_ids = @{ + $analyser->get_all_by_class( 'source', + 'KNOWN-ensembl-protein_coding' ) }; + Description : Gets a list of stable Id for a given class. + Return type : Arrayref of String (stable Ids) + Exceptions : thrown on missing arguments + Caller : internal + Status : At Risk + : under development + +=cut + +sub get_all_by_class { + my ($self, $dbtype, $class) = @_; + + # argument check + throw("Need a dbtype (source|target).") unless ($dbtype); + throw("Need a class.") unless ($class); + + my %merged = (); + + foreach my $subclass (keys %{ $self->{$dbtype}->{$class} || {} }) { + while (my ($key, $val) = each(%{ $self->{$dbtype}->{$class}->{$subclass} })) { + $merged{$key} = $val; + } + } + + return [ keys %merged ]; +} + + +=head2 get_count_by_subclass + + Arg[1] : String $dbtype - db type ('source' or 'target') + Arg[2] : String $class - key identifying a gene type (see class_key()) + Arg[3] : String $subclass - status identifier (e.g. 'mapped', 'lost') + Example : my $num_mapped = $analyser->get_count_by_subclass('source', + 'KNOWN-ensembl-protein_coding', 'mapped'); + Description : Gets the number of stable Ids for a given subclass. + Return type : Int + Exceptions : thrown on missing arguments + Caller : internal + Status : At Risk + : under development + +=cut + +sub get_count_by_subclass { + my ($self, $dbtype, $class, $subclass) = @_; + + # argument check + throw("Need a dbtype (source|target).") unless ($dbtype); + throw("Need a class.") unless ($class); + throw("Need a subclass.") unless ($subclass); + + return scalar(keys %{ $self->{$dbtype}->{$class}->{$subclass} || {} }); +} + + +=head2 get_count_by_class + + Arg[1] : String $dbtype - db type ('source' or 'target') + Arg[2] : String $class - key identifying a gene type (see class_key()) + Example : my $num_mapped = $analyser->get_count_by_class('source', + 'KNOWN-ensembl-protein_coding'); + Description : Gets the number of stable Ids for a given class. + Return type : Int + Exceptions : thrown on missing arguments + Caller : internal + Status : At Risk + : under development + +=cut + +sub get_count_by_class { + my ($self, $dbtype, $class) = @_; + + # argument check + throw("Need a dbtype (source|target).") unless ($dbtype); + throw("Need a class.") unless ($class); + + return scalar(@{ $self->get_all_by_class($dbtype, $class) }); +} + + +=head2 get_all_classes + + Arg[1] : String $dbtype - db type ('source' or 'target') + Example : foreach my $class (@{ $analyser->get_all_classes('source') }) { + print "$class\n"; + } + Description : Gets a list of classes in the ResultAnalyser. + Return type : Arrayref of String + Exceptions : thrown on missing argument + Caller : internal + Status : At Risk + : under development + +=cut + +sub get_all_classes { + my ($self, $dbtype) = @_; + + # argument check + throw("Need a dbtype (source|target).") unless ($dbtype); + + return [ sort keys %{ $self->{$dbtype} || {} } ]; +} + + +=head2 class_key + + Arg[1] : Bio::EnsEMBL::IdMapping::TinyGene $gene - a gene object + Example : my $class = $analyser->class_key($gene); + Description : Generates a key identifying a gene class. This identifier is + composed from the gene's status, logic naame, and biotye. + Return type : String + Exceptions : none + Caller : internal + Status : At Risk + : under development + +=cut + +sub class_key { + my ($self, $gene) = @_; + return join('-', map { $gene->$_ } qw(status logic_name biotype)); +} + + +=head2 write_results_to_file + + Example : $analyser->write_results_to_file; + Description : Writes the results of the result analysis to a file. This is a + human-readable text detailing the mapping statistics. + Return type : none + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub write_results_to_file { + my $self = shift; + + my $fh = $self->get_filehandle('gene_detailed_mapping_stats.txt', 'stats'); + + my $fmt1 = "%-60s%-16s%-16s%-16s\n"; + my $fmt2 = "%-60s%5.0f (%7s) %5.0f (%7s) %5.0f (%7s)\n"; + my $fmt3 = "%3.2f%%"; + + print $fh "Gene detailed mapping results:\n\n"; + + print $fh sprintf($fmt1, "Gene type", "mapped", "lost (similar)", + "lost (definite)"); + + print $fh ('-'x108), "\n"; + + foreach my $class (@{ $self->get_all_classes('source') }) { + next if ($class eq 'all'); + + my $total = $self->get_count_by_class('source', $class); + + # avoid division by zero error + unless ($total) { + $self->logger->warning("No count found for $class.\n", 1); + next; + } + + my $mapped = $self->get_count_by_subclass('source', $class, 'mapped'); + my $similar = $self->get_count_by_subclass('source', $class, + 'lost_similar'); + my $lost = $self->get_count_by_subclass('source', $class, 'lost_definite'); + + print $fh sprintf($fmt2, + $class, + $mapped, sprintf($fmt3, $mapped/$total*100), + $similar, sprintf($fmt3, $similar/$total*100), + $lost, sprintf($fmt3, $lost/$total*100)); + } + + close($fh); +} + + +=head2 create_clicklist + + Example : $analyser->create_clicklist; + Description : Writes an html file which contains a list of all lost genes, + with hyperlinks to the appropriate archive website. This is to + manually check lost genes. + Return type : none + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub create_clicklist { + my $self = shift; + + my $fh = $self->get_filehandle('genes_lost.html', 'stats'); + + # start html output + print $fh qq(<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n); + print $fh qq(<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-gb" lang="en-gb">); + print $fh "<head>\n"; + print $fh "<title>Lost genes "; + print $fh $self->conf->param('sourcedbname'), ' -> ', + $self->conf->param('targetdbname'); + print $fh "</title>\n"; + print $fh "</head>\n<body>\n"; + + my $prefix = $self->conf->param('urlprefix'); + unless ($prefix) { + $self->logger->warning("No urlprefix set, clicklists might not be useable.\n", 1); + } + + my $navigation; + my $clicklist; + + foreach my $class (@{ $self->get_all_classes('source') }) { + next if ($class eq 'all'); + + $navigation .= "$class "; + $clicklist .= "<h1>$class</h1>\n"; + + foreach my $subclass (qw(lost_similar lost_definite)) { + + # navigation + $navigation .= qq(<a href="#${class}-$subclass">$subclass</a> ); + + # clicklist + $clicklist .= "<h2>$subclass</h2>\n"; + + foreach my $stable_id (@{ $self->get_all_by_subclass('source', $class, $subclass) }) { + $clicklist .= qq(<a href="${prefix}$stable_id">$stable_id</a><br />\n); + } + + } + + $navigation .= "<br />\n"; + } + + # print navigation and clicklist + print $fh "$navigation\n\n"; + print $fh "$clicklist\n\n"; + + # html footer + print $fh "</body></html>\n"; + + close($fh); +} + + +=head2 create_mapping_summary + + Example : $analyser->create_mapping_summary(); + Description : Writes a text file containing a summary of the mapping stats. + This will be emailed to the genebuilder for evaluation (you will + have to manually send the email, using the text in + "mapping_summary.txt" as the template). + Return type : none + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub create_mapping_summary { + my $self = shift; + + my $fh = $self->get_filehandle('mapping_summary.txt'); + + # + # title + # + print $fh qq(Stable ID mapping results\n); + print $fh qq(=========================\n\n); + + # + # timing + # + print $fh "Run at: ".localtime()."\n"; + print $fh "Runtime: "; + print $fh $self->logger->runtime, "\n\n"; + + # + # parameters used for this run + # + print $fh $self->conf->list_param_values; + print $fh "\n"; + + # + # mapping stats + # + foreach my $type (qw(exon transcript translation gene gene_detailed)) { + my $filename = "${type}_mapping_stats.txt"; + + if ($self->file_exists($filename, 'stats')) { + print $fh $self->read_from_file($filename, 'stats'); + print $fh "\n\n"; + } else { + print $fh "No mapping stats found for $type.\n\n"; + } + } + + # + # db uploads + # + my @uploads = ( + ['stable_ids' => 'Stable IDs'], + ['events' => 'Stable ID events and mapping session'], + ['archive' => 'Gene and peptide archive'], + ); + + my $fmt1 = "%-40s%-20s\n"; + + print $fh qq(Data uploaded to db:\n); + print $fh qq(====================\n\n); + + if ($self->conf->param('dry_run')) { + + print $fh "None (dry run).\n"; + + } else { + + foreach my $u (@uploads) { + my $uploaded = 'no'; + $uploaded = 'yes' if ($self->conf->is_true("upload_".$u->[0])); + print $fh sprintf($fmt1, $u->[1], $uploaded); + } + + } + + print $fh "\n"; + + # + # stats and clicklist + # + my @output = ( + ['stats' => 'statistics (including clicklists of deleted IDs)'], + ['debug' => 'detailed mapping output for debugging'], + ['tables' => 'data files for db upload'], + ); + + my $fmt2 = "%-20s%-50s\n"; + + print $fh qq(\nOutput directories:\n); + print $fh qq(===================\n\n); + + print $fh sprintf($fmt2, qw(DIRECTORY DESCRIPTION)); + print $fh ('-'x72), "\n"; + + print $fh sprintf($fmt2, 'basedir', $self->conf->param('basedir')); + + foreach my $o (@output) { + print $fh sprintf($fmt2, '$basedir/'.$o->[0], $o->[1]); + } + + print $fh "\n"; + + # + # clicklist of first 10 deleted genes + # + print $fh qq(\nFirst 10 deleted known genes:\n); + print $fh qq(=============================\n\n); + + my $in_fh = $self->get_filehandle('genes_lost.txt', 'debug', '<'); + my $prefix = $self->conf->param('urlprefix'); + my $i; + + while (<$in_fh>) { + last if (++$i > 10); + + chomp; + my ($stable_id, $type) = split(/\s+/); + + next unless ($type eq 'known'); + + print $fh sprintf($fmt2, $stable_id, "${prefix}$stable_id"); + } + + close($in_fh); + close($fh); +} + + +=head2 read_from_file + + Arg[1] : String $filename - name of file to read + Arg[2] : (optional) String $append - directory name to append to basedir + Example : my $stats_text = $analyser->read_from_file('gene_mapping_stats', + 'stats'); + Description : Reads mapping stats from a file. + Return type : String + Exceptions : none + Caller : internal + Status : At Risk + : under development + +=cut + +sub read_from_file { + my $self = shift; + my $filename = shift; + my $append = shift; + + my $in_fh = $self->get_filehandle($filename, $append, '<'); + + my $txt; + + while (<$in_fh>) { + $txt .= $_; + } + + return $txt; +} + +1; +