Mercurial > repos > mahtabm > ensembl

diff variant_effect_predictor/Bio/EnsEMBL/IdMapping/Archiver.pm @ 0:1f6dce3d34e0
Uploaded
author: mahtabm
date: Thu, 11 Apr 2013 02:01:53 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/Archiver.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,334 @@
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=cut
+
+=head1 NAME
+
+Bio::EnsEMBL::IdMapping::Archiver - create gene_archive and peptide_archive
+
+=head1 SYNOPSIS
+
+  my $archiver = Bio::EnsEMBL::IdMapping::Archiver->new(
+    -LOGGER => $logger,
+    -CONF   => $conf,
+    -CACHE  => $cache
+  );
+
+  # create gene and peptide archive
+  $archiver->create_archive($mapping_session_id);
+
+  # dump existing archive tables to file
+  my $num_entries =
+    $archiver->dump_table_to_file( 'source', 'gene_archive',
+    'gene_archive_existing.txt', 1 );
+
+=head1 DESCRIPTION
+
+This module creates the gene_archive and peptide_archive
+tables. Data is written to a file as tab-delimited text for
+loading into a MySQL database (this can be done manually, or using
+StableIdmapper->upload_file_into_table()).
+
+An archive entry for a given source gene is created if no target
+gene exists, or if any of its transcripts or their translations
+changed. Non-coding transcripts only have an entry in gene_archive (i.e.
+without a corresponding peptide_archive entry).
+
+=head1 METHODS
+
+  create_archive
+  dump_gene
+  dump_tuple
+  dump_nc_row
+  mapping_session_id
+
+=cut
+
+
+package Bio::EnsEMBL::IdMapping::Archiver;
+
+use strict;
+use warnings;
+no warnings 'uninitialized';
+
+use Bio::EnsEMBL::IdMapping::BaseObject;
+our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject);
+
+use Bio::EnsEMBL::Utils::Exception qw(throw warning);
+use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
+use Digest::MD5 qw(md5_hex);
+
+
+# instance variables
+my $pa_id;
+
+
+=head2 create_archive
+
+  Arg[1]      : Int $mapping_session_id - the mapping_session_id for this run
+  Example     : $archiver->create_archive($stable_id_mapper->mapping_session_id);
+  Description : Creates the gene_archive and peptide_archive tables and writes
+                the data to a tab-delimited file. The decision as to what to
+                archive is deferred to dump_gene(), see documentation there for
+                details.
+  Return type : none
+  Exceptions  : Thrown on missing argument.
+  Caller      : id_mapping.pl
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub create_archive {
+  my $self = shift;
+  my $mapping_session_id = shift;
+
+  # argument check
+  unless ($mapping_session_id) {
+    $self->logger->warning("No mapping_session_id set.");
+  }
+
+  $self->mapping_session_id($mapping_session_id);
+
+  # get filehandles to write gene and peptide archive
+  my $ga_fh = $self->get_filehandle('gene_archive_new.txt', 'tables');
+  my $pa_fh = $self->get_filehandle('peptide_archive_new.txt', 'tables');
+
+  # get the currently highest peptide_archive_id from the source db
+  my $s_dba = $self->cache->get_DBAdaptor('source');
+  my $s_dbh = $s_dba->dbc->db_handle;
+  my $sql = qq(SELECT MAX(peptide_archive_id) FROM peptide_archive);
+  $pa_id = $self->fetch_value_from_db($s_dbh, $sql);
+
+  unless ($pa_id) {
+    $self->logger->warning("No max(peptide_archive_id) found in db.\n", 1);
+    $self->logger->info("That's ok if this is the first stable ID mapping for this species.\n", 1);
+  }
+  
+  $pa_id++;
+  $self->logger->debug("Starting with peptide_archive_id $pa_id.\n");
+
+  # lookup hash of target gene stable IDs
+  my %target_genes = map { $_->stable_id => $_ }
+    values %{ $self->cache->get_by_name("genes_by_id", 'target') };
+
+  # loop over source genes and dump to archive (dump_gene() will decide whether
+  # to do full or partial dump)
+  foreach my $source_gene (values %{ $self->cache->get_by_name("genes_by_id",
+    'source') }) {
+
+    $self->dump_gene($source_gene, $target_genes{$source_gene->stable_id}, 
+      $ga_fh, $pa_fh);
+  }
+  
+  close($ga_fh);
+  close($pa_fh);
+}
+
+
+=head2 dump_gene
+
+  Arg[1]      : Bio::EnsEMBL::IdMapping::TinyGene $s_gene - source gene
+  Arg[2]      : Bio::EnsEMBL::IdMapping::TinyGene $t_gene - target gene
+  Arg[3]      : Filehandle $ga_fh - filehandle for writing gene_archive data
+  Arg[4]      : Filehandle $pa_fh - filehandle for writing peptide_archive data
+  Example     : my $target_gene = $gene_mappings{$source_gene->stable_id};
+                $archiver->dump_gene($source_gene, $target_gene, $ga_fh, $pa_fh);
+  Description : Given a source gene, it will write a gene_achive and
+                peptide_achive entry for it if no target gene exists, or if any
+                of its transcripts or their translation changed. 
+  Return type : none
+  Exceptions  : none
+  Caller      : create_archive()
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub dump_gene {
+  my ($self, $s_gene, $t_gene, $ga_fh, $pa_fh) = @_;
+
+  # private method, so no argument check done for performance reasons
+  
+  # deal with ncRNA differently
+  # hope this simple biotype regex is accurate enough...
+  my $is_ncRNA = 0;
+  $is_ncRNA = 1 if ($s_gene->biotype =~ /RNA/);
+
+  # loop over source transcripts
+  foreach my $s_tr (@{ $s_gene->get_all_Transcripts }) {
+    my $s_tl = $s_tr->translation;
+
+    # we got a coding transcript
+    if ($s_tl) {
+      
+      # do a full dump of this gene if no target gene exists
+      if (! $t_gene) {
+        $self->dump_tuple($s_gene, $s_tr, $s_tl, $ga_fh, $pa_fh);
+
+      # otherwise, only dump if any transcript or its translation changed
+      } else {
+
+        my $changed_flag = 1;
+      
+        foreach my $t_tr (@{ $t_gene->get_all_Transcripts }) {
+          my $t_tl = $t_tr->translation;
+          next unless ($t_tl);
+
+          if (($s_tr->stable_id eq $t_tr->stable_id) and
+              ($s_tl->stable_id eq $t_tl->stable_id) and
+              ($s_tl->seq eq $t_tl->seq)) {
+            $changed_flag = 0;
+          }
+        }
+
+        if ($changed_flag) {
+          $self->dump_tuple($s_gene, $s_tr, $s_tl, $ga_fh, $pa_fh);
+        }
+      }
+
+    # now deal with ncRNAs (they don't translate but we still want to archive
+    # them)
+    } elsif ($is_ncRNA) {
+    
+      if (! $t_gene) {
+    
+        $self->dump_nc_row($s_gene, $s_tr, $ga_fh);
+    
+      } else {
+        
+        my $changed_flag = 1;
+      
+        foreach my $t_tr (@{ $t_gene->get_all_Transcripts }) {
+          $changed_flag = 0 if ($s_tr->stable_id eq $t_tr->stable_id);
+        }
+
+        if ($changed_flag) {
+          $self->dump_nc_row($s_gene, $s_tr, $ga_fh);
+        }
+      
+      }
+    }
+  }
+}
+
+
+=head2 dump_tuple
+
+  Arg[1]      : Bio::EnsEMBL::IdMapping::TinyGene $gene - gene to archive
+  Arg[2]      : Bio::EnsEMBL::IdMapping::TinyTrancript $tr - its transcript
+  Arg[3]      : Bio::EnsEMBL::IdMapping::TinyTranslation $tl - its translation
+  Arg[4]      : Filehandle $ga_fh - filehandle for writing gene_archive data
+  Arg[5]      : Filehandle $pa_fh - filehandle for writing peptide_archive data
+  Example     : $archive->dump_tuple($s_gene, $s_tr, $s_tl, $ga_fh, $pa_fh);
+  Description : Writes entry lines for gene_archive and peptide_archive.
+  Return type : none
+  Exceptions  : none
+  Caller      : dump_gene()
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub dump_tuple {
+  my ($self, $gene, $tr, $tl, $ga_fh, $pa_fh) = @_;
+  
+  # private method, so no argument check done for performance reasons
+
+  # gene archive
+  print $ga_fh join("\t",
+                    $gene->stable_id,
+                    $gene->version,
+                    $tr->stable_id,
+                    $tr->version,
+                    $tl->stable_id,
+                    $tl->version,
+                    $pa_id,
+                    $self->mapping_session_id
+  );
+  print $ga_fh "\n";
+
+  # peptide archive
+  my $pep_seq = $tl->seq;
+  print $pa_fh join("\t", $pa_id, md5_hex($pep_seq), $pep_seq);
+  print $pa_fh "\n";
+
+  # increment peptide_archive_id
+  $pa_id++;
+}
+
+
+=head2 dump_nc_row
+
+  Arg[1]      : Bio::EnsEMBL::IdMapping::TinyGene $gene - gene to archive
+  Arg[2]      : Bio::EnsEMBL::IdMapping::TinyTrancript $tr - its transcript
+  Arg[3]      : Filehandle $ga_fh - filehandle for writing gene_archive data
+  Example     : $archive->dump_nc_row($s_gene, $s_tr, $ga_fh);
+  Description : Writes an entry line for gene_archive for non-coding
+                transcripts.
+  Return type : none
+  Exceptions  : none
+  Caller      : dump_gene()
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub dump_nc_row {
+  my ($self, $gene, $tr, $ga_fh) = @_;
+  
+  # private method, so no argument check done for performance reasons
+
+  # gene archive
+  print $ga_fh join("\t",
+                    $gene->stable_id,
+                    $gene->version,
+                    $tr->stable_id,
+                    $tr->version,
+                    '\N',
+                    '\N',
+                    '\N',
+                    $self->mapping_session_id
+  );
+  print $ga_fh "\n";
+}
+
+
+=head2 mapping_session_id
+
+  Arg[1]      : (optional) Int - mapping_session_id to set
+  Example     : my $msi = $archiver->mapping_session_id;
+  Description : Getter/setter for mapping_session_id.
+  Return type : Int
+  Exceptions  : none
+  Caller      : create_archive()
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub mapping_session_id {
+  my $self = shift;
+  $self->{'_mapping_session_id'} = shift if (@_);
+  return $self->{'_mapping_session_id'};
+}
+
+
+1;
+
author	mahtabm
date	Thu, 11 Apr 2013 02:01:53 -0400
parents
children