Mercurial > repos > mahtabm > ensembl

diff variant_effect_predictor/Bio/EnsEMBL/Utils/VegaCuration/Transcript.pm @ 0:1f6dce3d34e0
Uploaded
author: mahtabm
date: Thu, 11 Apr 2013 02:01:53 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/VegaCuration/Transcript.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,307 @@
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=cut
+
+=head1 NAME
+
+=head1 SYNOPSIS
+
+=head1 DESCRIPTION
+
+=head1 METHODS
+
+=cut
+
+package Bio::EnsEMBL::Utils::VegaCuration::Transcript;
+
+use strict;
+use warnings;
+no warnings 'uninitialized';
+use vars qw(@ISA);
+
+use Bio::EnsEMBL::Utils::VegaCuration::Gene;
+use Data::Dumper;
+
+@ISA = qw(Bio::EnsEMBL::Utils::VegaCuration::Gene);
+
+
+=head2 find_non_overlaps
+
+   Args       : arrayref of B::E::Transcripts
+   Example    : find_non_overlaps($all_transcripts)
+   Description: identifies any non-overlapping transcripts
+   Returntype : array refs of stable IDs
+   Exceptions : none
+
+=cut
+
+sub find_non_overlaps {
+  my $self = shift;
+  my ($all_transcripts) = @_;
+  my $non_overlaps = [];
+  foreach my $transcript1 (@{$all_transcripts}) {
+    foreach my $transcript2 (@{$all_transcripts}) {
+      if ($transcript1->end < $transcript2->start) {
+	push @{$non_overlaps}, $transcript1->stable_id;
+	push @{$non_overlaps}, $transcript2->stable_id;
+      }
+    }
+  }
+  return $non_overlaps;
+}
+
+=head2 check_remarks_and_update_names
+
+   Arg[1]     : B::E::Gene (with potentially duplicated transcript names)
+   Arg[2]     : counter 1 (no. of patched genes)
+   Arg[3]     : counter 2 (no. of patched transcripts)
+   Example    : $support->update_names($gene,\$c1,\$c2)
+   Description: - checks remarks and patches transcripts with identical names according to
+                CDS and length
+   Returntype : true | false (depending on whether patched or not), counter1, counter2
+
+=cut
+
+sub check_remarks_and_update_names {
+  my $self = shift;
+  my ($gene,$gene_c,$trans_c) = @_;
+  my $action = ($self->param('dry_run')) ? 'Would add' : 'Added';
+  my $aa  = $gene->adaptor->db->get_AttributeAdaptor;
+  my $dbh = $gene->adaptor->db->dbc->db_handle;
+
+  #get list of IDs that have previously been sent to annotators
+  my $seen_genes = $self->get_havana_fragmented_loci_comments;
+
+  my $gsi    = $gene->stable_id;
+  my $gid    = $gene->dbID;
+  my $g_name;
+  my $study_more = 1;
+  eval {
+    $g_name = $gene->display_xref->display_id;
+  };	
+  if ($@) {
+    $g_name = $gene->get_all_Attributes('name')->[0]->value;
+  }
+
+  #get existing gene remarks
+  my $remarks = [ map {$_->value} @{$gene->get_all_Attributes('remark')} ];
+
+  #shout if there is no remark to identify this as being fragmented
+  if ( grep {$_ eq 'fragmented locus' } @$remarks) {
+    $study_more = 0;
+  }
+  else {
+    $self->log_warning("Gene $gsi should have a fragmented locus remark\n");
+  }
+
+  ##patch transcript names according to length and CDS
+  $gene_c++;
+
+  #separate coding and non_coding transcripts
+  my $coding_trans = [];
+  my $noncoding_trans = [];
+  foreach my $trans ( @{$gene->get_all_Transcripts()} ) {
+    if ($trans->translate) {
+      push @$coding_trans, $trans;
+    }
+    else {
+      push @$noncoding_trans, $trans;
+    }
+  }
+
+  #sort transcripts coding > non-coding, then on length
+  my $c = 0;
+  $self->log("\nPatching names according to CDS and length:\n",1);
+  foreach my $array_ref ($coding_trans,$noncoding_trans) {
+    foreach my $trans ( sort { $b->length <=> $a->length } @$array_ref ) {
+      $trans_c++;
+      my $tsi = $trans->stable_id;
+      my $t_name;
+      eval {
+	$t_name = $trans->display_xref->display_id;
+      };	
+      if ($@) {
+	$t_name = $trans->get_all_Attributes('name')->[0]->value;
+      }
+      $c++;
+      my $ext = sprintf("%03d", $c);
+      my $new_name = $g_name.'-'.$ext;
+      $self->log(sprintf("%-20s%-3s%-20s", "$t_name ", "-->", "$new_name")."\n",1);
+      if (! $self->param('dry_run')) {
+	
+	# update transcript display xref
+	$dbh->do(qq(UPDATE xref x, external_db edb
+                       SET x.display_label  = "$new_name"
+                     WHERE x.external_db_id = edb.external_db_id
+                       AND x.dbprimary_acc  = "$tsi"
+                       AND edb.db_name      = "Vega_transcript"));
+      }
+    }
+  }
+  return ($study_more,$gene_c,$trans_c);
+}
+
+=head2 check_names_and_overlap
+
+   Arg[1]     : arayref of arrayrefs of duplicated names
+   Arg[2]     : B::E::Gene (with potentially duplicated transcript names)
+   Arg[3]     : FH (to log new duplicates)
+   Example    : $support->check_names_and_overlap($transcripts,$gene,$fh)
+   Description: checks pairs of transcripts identified as having duplicate Vega names:
+                - to see if they have identical names in loutre (shouldn't have)
+                - distinguish between overlapping and non overlapping transcripts
+   Returntype : none
+
+=cut
+
+sub check_names_and_overlap {
+  my $self = shift;
+  my ($transcript_info,$gene,$n_flist_fh) = @_;
+  my $ta  = $gene->adaptor->db->get_TranscriptAdaptor;
+  my $gsi = $gene->stable_id;
+  my $g_name = $gene->get_all_Attributes('name')->[0]->value;
+  foreach my $set (values %{$transcript_info} ) {
+    next if (scalar @{$set} == 1);
+    my $transcripts = [];
+    my $all_t_names;
+    my %ids_to_names;
+    foreach my $id1 (@{$set}) {
+      my ($name1,$tsi1) = split /\|/, $id1;
+      $ids_to_names{$tsi1} = $name1;
+      $all_t_names .= "$tsi1 [$name1] ";
+      my $t = $ta->fetch_by_stable_id($tsi1);
+      push @{$transcripts}, $t;
+    }
+
+    my $non_overlaps;
+    eval {
+      $non_overlaps = $self->find_non_overlaps($transcripts);
+    };
+    if ($@) {
+      $self->log_warning("Problem looking for overlapping transcripts for gene $gsi (is_current = 0 ?). Skipping this bit\n");
+    }
+
+    #if the transcripts don't overlap
+    elsif (@{$non_overlaps}) {
+      my $tsi_string;
+      foreach my $id (@{$non_overlaps}) {
+	my $string = " $id [ $ids_to_names{$id} ] ";
+	$tsi_string .= $string;
+      }
+
+      $self->log_warning("NEW: Non-overlapping: $gsi ($g_name) has non-overlapping transcripts ($tsi_string) with duplicated Vega names, and it has no \'fragmented locus\' gene remark. Neither has it been OKeyed by Havana before. Transcript names are being patched but this needs checking by Havana.\n");
+      #log gsi (to be sent to Havana)
+      print $n_flist_fh "$gsi\n";
+    }
+    #...otherwise if the transcripts do overlap
+    else {
+      $self->log_warning("NEW: Overlapping: $gsi ($g_name) has overlapping transcripts ($all_t_names) with duplicated Vega names and it has no \'fragmented locus\' gene_remark. Neither has it been OKeyed by Havana before. Transcript names are being patched but this could be checked by Havana if they were feeling keen.\n");
+      print $n_flist_fh "$gsi\n";
+    }
+  }
+}		
+
+=head2 get_havana_fragmented_loci_comments
+
+   Args       : none
+   Example    : my $results = $support->get_havana_fragmented_loci_comments
+   Description: parses the HEREDOC containing Havana comments in this module
+   Returntype : hashref
+
+=cut
+
+sub get_havana_fragmented_loci_comments {
+  my $seen_genes;
+  while (<DATA>) {
+    next if /^\s+$/ or /#+/;
+    my ($obj,$comment) = split /=/;
+    $obj =~ s/^\s+|\s+$//g;
+    $comment =~ s/^\s+|\s+$//g;
+    $seen_genes->{$obj} = $comment;
+  }
+  return $seen_genes;
+}
+
+
+
+#details of genes with duplicated transcript names that have already been reported to Havana
+#identified as either fragmented or as being OK to patch
+__DATA__
+
+OTTMUSG00000005478 = fragmented
+OTTMUSG00000001936 = fragmented
+OTTMUSG00000017081 = fragmented
+OTTMUSG00000011441 = fragmented
+OTTMUSG00000013335 = fragmented
+OTTMUSG00000011654 = fragmented
+OTTMUSG00000001835 = fragmented
+OTTHUMG00000035221 = fragmented
+OTTHUMG00000037378 = fragmented
+OTTHUMG00000060732 = fragmented
+OTTHUMG00000132441 = fragmented
+OTTHUMG00000031383 = fragmented
+OTTHUMG00000012716 = fragmented
+OTTHUMG00000031102 = fragmented
+OTTHUMG00000148816 = fragmented
+OTTHUMG00000149059 = fragmented
+OTTHUMG00000149221 = fragmented
+OTTHUMG00000149326 = fragmented
+OTTHUMG00000149644 = fragmented
+OTTHUMG00000149574 = fragmented
+OTTHUMG00000058101 = fragmented
+
+OTTHUMG00000150119 = OK
+OTTHUMG00000149850 = OK
+OTTHUMG00000058101 = OK
+OTTHUMG00000058907 = OK
+
+OTTMUSG00000011654 = fragmented
+OTTMUSG00000019369 = fragmented
+OTTMUSG00000017081 = fragmented
+OTTMUSG00000001835 = fragmented
+OTTMUSG00000011499 = fragmented
+OTTMUSG00000013335 = fragmented
+OTTMUSG00000008023 = fragmented
+OTTMUSG00000019369 = fragmented
+
+
+OTTMUSG00000022266
+OTTMUSG00000006697
+
+
+
+
+
+OTTMUSG00000012302 =
+OTTMUSG00000013368 =
+OTTMUSG00000015766 =
+OTTMUSG00000016025 =
+OTTMUSG00000001066 =
+OTTMUSG00000016331 =
+OTTMUSG00000006935 =
+OTTMUSG00000007263 =
+OTTMUSG00000000304 =
+OTTMUSG00000009150 =
+OTTMUSG00000008023 =
+OTTMUSG00000017077 =
+OTTMUSG00000003440 =
+OTTMUSG00000016310 =
+OTTMUSG00000026199 =
+OTTMUSG00000028423 =
+OTTMUSG00000007427 =
author	mahtabm
date	Thu, 11 Apr 2013 02:01:53 -0400
parents
children