diff variant_effect_predictor/Bio/EnsEMBL/Utils/VegaCuration/Translation.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/VegaCuration/Translation.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,447 @@
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+  This software is distributed under a modified Apache license.
+  For license details, please see
+    http://www.ensembl.org/info/about/code_licence.html
+=head1 CONTACT
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=cut
+=head1 NAME
+=head1 SYNOPSIS
+=head1 DESCRIPTION
+=head1 METHODS
+=cut
+
+package Bio::EnsEMBL::Utils::VegaCuration::Translation;
+use strict;
+use warnings;
+use vars qw(@ISA);
+use Data::Dumper;
+use Bio::EnsEMBL::Utils::VegaCuration::Transcript;
+@ISA = qw(Bio::EnsEMBL::Utils::VegaCuration::Transcript);
+
+=head2 check_CDS_start_end_remarks
+
+   Args       : B::E::Transcript
+   Example    : my $results = $support->check_CDS_end_remarks($transcript)
+   Description: identifies incorrect 'CDS end...' transcript remarks in a
+                otter-derived Vega database
+   Returntype : hashref
+
+=cut
+
+sub check_CDS_start_end_remarks {
+  my $self = shift; 
+  my $trans = shift;
+  # info for checking
+  my @remarks = @{$trans->get_all_Attributes('remark')};
+  my $coding_end   = $trans->cdna_coding_end;
+  my $coding_start = $trans->cdna_coding_start;
+  my $trans_end    = $trans->length;
+  my $trans_seq    = $trans->seq->seq;
+  my $stop_codon   = substr($trans_seq, $coding_end-3, 3);
+  my $start_codon  = substr($trans_seq, $coding_start-1, 3);
+  #hashref to return results
+  my $results;
+  
+  #extra CDS end not found remarks
+  if (grep {$_->value eq 'CDS end not found'} @remarks) {
+    if (   ($coding_end != $trans_end) 
+	     && ( grep {$_ eq $stop_codon} qw(TGA TAA TAG) ) ) {
+      $results->{'END_EXTRA'} = 1;
+    }
+  }
+  #missing CDS end not found remark
+  if ( $coding_end == $trans_end ) {
+    if (! grep {$_->value eq 'CDS end not found'} @remarks) {
+      if (grep {$_ eq $stop_codon} qw(TGA TAA TAG)) {
+	$results->{'END_MISSING_2'} = 1;
+      }
+      else {
+	$results->{'END_MISSING_1'} = $stop_codon;
+      }
+    }
+  }
+  #extra CDS start not found remark
+  if (grep {$_->value eq 'CDS start not found'} @remarks) {
+    if (   ($coding_start != 1) 
+	     && ($start_codon eq 'ATG') ) {
+      $results->{'START_EXTRA'} = 1;
+    }
+  }
+  #missing CDS start not found remark
+  if ( $coding_start == 1) {
+    if ( ! grep {$_->value eq 'CDS start not found'} @remarks) {
+      if ($start_codon eq 'ATG') {
+	$results->{'START_MISSING_2'} = 1;
+      } else {
+	$results->{'START_MISSING_1'} = $start_codon;
+      }
+    }
+  }
+  return $results;
+}
+
+=head2 check_CDS_start_end_remarks_loutre
+
+   Args       : B::E::Transcript
+   Example    : my $results = $support->check_CDS_end_remarks($transcript)
+   Description: identifies incorrect 'CDS end...' transcript attribs in a loutre
+                of a loutre-derived Vega database.
+   Returntype : hashref
+
+=cut
+
+sub check_CDS_start_end_remarks_loutre {
+  my $self = shift;
+  my $trans = shift;
+
+  # info for checking
+  my @stops = qw(TGA TAA TAG);
+  my %attributes;
+  foreach my $attribute (@{$trans->get_all_Attributes()}) {
+    push @{$attributes{$attribute->code}}, $attribute;
+  }
+#  warn $trans->stable_id;
+#  warn Data::Dumper::Dumper(\%attributes);
+  my $coding_end   = $trans->cdna_coding_end;
+  my $coding_start = $trans->cdna_coding_start;
+  my $trans_end    = $trans->length;
+  my $trans_seq    = $trans->seq->seq;
+  my $stop_codon_offset = 3 + $trans->translation->end_Exon->end_phase;
+  my $initial_exon_phase = $trans->translation->start_Exon->phase;
+  my $stop_codon   = substr($trans_seq, $coding_end-3, 3);
+  my $start_codon  = substr($trans_seq, $coding_start-1, 3);
+
+  my $start_codon_incorrect = 1;
+  if ($start_codon eq 'ATG' ) {
+    $start_codon_incorrect = 0;
+  }
+  elsif ($start_codon eq 'CTG') {
+    foreach my $attrib (@{$attributes{'remark'}}) {
+      if ($attrib->value =~ /non[- ]ATG start/) {
+	$start_codon_incorrect = 0;
+      }
+    }
+  }
+
+#  warn "$start_codon -- $initial_exon_phase -- $coding_start -- $start_codon_incorrect";
+
+  #hashref to return results
+  my $results;
+
+  #extra CDS end not found remarks
+  if ($attributes{'cds_end_NF'}) {
+    if ( ($attributes{'cds_end_NF'}->[0]->value == 1)
+	   && ($coding_end != $trans_end) 
+	   && ( grep {$_ eq $stop_codon} @stops) ) {
+#      warn $trans->stable_id.": $coding_end--$trans_end--$stop_codon";
+#      warn $trans->translation->end_Exon->end_phase;
+      $results->{'END_EXTRA'} = $stop_codon;
+    }
+  }
+  #missing CDS end not found remark
+  if ( $coding_end == $trans_end ) {
+    if ($attributes{'cds_end_NF'}) {
+      if ($attributes{'cds_end_NF'}->[0]->value == 0 ) {
+	if (! grep {$_ eq $stop_codon} @stops) {
+#	  warn $trans->translation->end_Exon->end_phase;
+	  $results->{'END_MISSING'}{'WRONG'} = $stop_codon;
+	}
+      }
+    }
+    elsif (! grep {$_ eq $stop_codon} @stops) {
+      $results->{'END_MISSING'}{'ABSENT'} = $stop_codon;
+    }
+  }
+  #extra CDS start not found remark 
+  if ( $attributes{'cds_start_NF'}) {
+    if (   ($attributes{'cds_start_NF'}->[0]->value == 1 )
+        && (! $start_codon_incorrect)) {
+      unless ( ($coding_start == 1) && ( $initial_exon_phase > 0)) {
+	$results->{'START_EXTRA'} = $start_codon;
+      }
+    }
+  }
+  #missing CDS start not found remark
+  if ( $coding_start == 1) {
+    if ( $attributes{'cds_start_NF'} ) {
+      if ( $attributes{'cds_start_NF'}->[0]->value == 0 ) {
+	if ($start_codon_incorrect) {
+	  $results->{'START_MISSING'}{'ABSENT'} = $start_codon;
+	}
+	elsif ($initial_exon_phase > 0) {
+	  $results->{'START_MISSING'}{'INITIAL_PHASE'} = $initial_exon_phase;
+	}
+      }
+    }
+    elsif ($start_codon ne 'ATG') {
+      $results->{'START_MISSING'}{'ABSENT'} = $start_codon;
+    }
+
+  }
+  return $results;
+}
+
+=head2 get_havana_seleno_comments
+
+   Args       : none
+   Example    : my $results = $support->get_havana_seleno_comments
+   Description: parses the HEREDOC containing Havana comments in this module
+   Returntype : hashref
+
+=cut
+
+sub get_havana_seleno_comments {
+  my $seen_translations;
+  while (<DATA>) {
+    next if /^\s+$/ or /#+/;
+    my ($obj,$comment) = split /=/;
+    $obj =~ s/^\s+|\s+$//g;
+    $comment =~ s/^\s+|\s+$//g;
+    # We add the origin as now "seen" can come from a number of places, and have
+    # a number of consequences in different cases, not just discounted Secs from this method. -- ds23
+    $seen_translations->{$obj} = [ $comment,"notsec-havana" ];
+  }
+  return $seen_translations;
+}
+
+sub check_for_stops {
+  my $support = shift;
+  my ($gene,$seen_transcripts,$log_object) = @_;
+  my $transcripts;
+  my $has_log_object=defined($log_object);
+  if($has_log_object){
+    my @help = $log_object->species_params->get_trans($gene->stable_id);
+    $transcripts=\@help;
+  }else{
+    $log_object=$support;
+    $transcripts=$gene->get_all_Transcripts;
+  }
+
+  my $gname = $gene->get_all_Attributes('name')->[0]->value;
+  my $gsi = $gene->stable_id;
+  my $scodon = 'TGA';
+  my $mod_date = $support->date_format( $gene->modified_date,'%d/%m/%y' );
+  my $hidden_remak_ttributes;
+ TRANS:
+  foreach my $trans (@$transcripts) {
+    my $tsi = $trans->stable_id;
+    my $tID = $trans->dbID;
+    my $tname = $trans->get_all_Attributes('name')->[0]->value;
+    if($has_log_object){
+      $hidden_remak_ttributes=$log_object->species_params->get_attributes->{$tsi}->{'hidden_remark'};
+    }else{
+      $hidden_remak_ttributes=$trans->get_all_Attributes('hidden_remark');
+    }
+    foreach my $rem (@$hidden_remak_ttributes) {
+      if ($rem->value =~ /not_for_Vega/) {
+        #$support->log_verbose("Skipping transcript $tname ($tsi) since 'not_for_Vega'\n",1);
+        $log_object->_save_log('log_verbose', '', $gsi, '', $tsi, '', "Skipping transcript $tname ($tsi) since 'not_for_Vega'");
+        next TRANS;
+      }
+    }
+
+    # go no further if there is a ribosomal framshift attribute
+    foreach my $attrib (@{$trans->get_all_Attributes('_rib_frameshift')}) {
+      if ($attrib->value) {
+        $log_object->_save_log('log', '', $gsi, '', $tsi, '', "Skipping $tsi ($tname) since it has a ribosomal frameshift attribute");
+        next TRANS;
+      }
+    }
+
+    #$support->log_verbose("Studying transcript $tsi ($tname, $tID)\n",1);
+    $log_object->_save_log('log_verbose', '', $gsi, '', $tsi, '', "Studying transcript $tsi ($tname, $tID)");
+    my $peptide;
+		
+    # go no further if the transcript doesn't translate or if there are no stops
+    next TRANS unless ($peptide = $trans->translate);
+    my $pseq = $peptide->seq;
+    my $orig_seq = $pseq;
+    # (translate method trims stops from sequence end)
+
+    next TRANS unless ($pseq =~ /\*/);
+    # warn sprintf("Stop codon is '%s'\n",substr($trans->translateable_seq,-3));
+    #$support->log_verbose("Stops found in $tsi ($tname)\n",1);
+    $log_object->_save_log('log_verbose', '', $gsi, '', $tsi, '', "Stops found in $tsi ($tname)");
+
+    # find out where and how many stops there are
+    my @found_stops;
+    my $mrna   = $trans->translateable_seq;
+    my $offset = 0;
+    my $tstop;
+    while ($pseq =~ /^([^\*]*)\*(.*)/) {
+      my $pseq1_f = $1;
+      $pseq = $2;
+      my $seq_flag = 0;
+      $offset += length($pseq1_f) * 3;
+      my $stop = substr($mrna, $offset, 3);
+      my $aaoffset = int($offset / 3)+1;
+      push(@found_stops, [ $stop, $aaoffset ]);
+      $tstop .= "$aaoffset ";
+      $offset += 3;
+    }
+		
+    # are all stops TGA...?
+    my $num_stops = scalar(@found_stops);
+    my $num_tga = 0;
+    my $positions;
+    foreach my $stop (@found_stops) {
+      $positions .= $stop->[0]."(".$stop->[1].") ";
+      if ($stop->[0] eq $scodon) {
+	$num_tga++;
+      }
+    }
+    my $source = $gene->source;
+    #...no - an internal stop codon error in the database...
+    if ($num_tga < $num_stops) {
+      if ($source eq 'havana') {
+        #$support->log_warning("INTERNAL STOPS HAVANA: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons [$mod_date]:\nSequence = $orig_seq\nStops at $positions)\n\n");
+        $log_object->_save_log('log_warning', '', $gsi, 'TRANSCRIPT', $tsi, 'VQCT_internal_stop', "INTERNAL STOPS HAVANA: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons [$mod_date]: Sequence = $orig_seq Stops at $positions)");
+      }
+      else {
+        #$support->log_warning("INTERNAL STOPS EXTERNAL: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons[$mod_date]:\nSequence = $orig_seq\nStops at $positions)\n\n");
+        $log_object->_save_log('log_warning', '', $gsi, 'TRANSCRIPT', $tsi, 'VQCT_internal_stop', "INTERNAL STOPS EXTERNAL: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons[$mod_date]: Sequence = $orig_seq Stops at $positions)");  
+      }
+    }
+    #...yes - check remarks
+    else {
+      my $flag_remark  = 0; # 1 if word seleno has been used
+      my $flag_remark2 = 0; # 1 if existing remark has correct numbering
+      my $alabel       = 'Annotation_remark- selenocysteine ';
+      my $alabel2      = 'selenocysteine ';
+      my $annot_stops;
+      my $remarks;
+      my $att;
+      #get both hidden_remarks and remarks
+      foreach my $remark_type ('remark','hidden_remark') {
+        if($has_log_object){
+          $att=$log_object->species_params->get_attributes->{$trans->stable_id}->{$remark_type};
+        }else{
+          $att=$trans->get_all_Attributes($remark_type)
+        }
+        foreach my $attrib ( @$att) {
+          push @{$remarks->{$remark_type}}, $attrib->value;
+        }
+      }
+      #parse remarks to check syntax for location of edits
+      while (my ($attrib,$remarks)= each %$remarks) {
+        foreach my $text (@{$remarks}) {					
+          if ( ($attrib eq 'remark') && ($text=~/^$alabel(.*)/) ){
+            #$support->log_warning("seleno remark for $tsi stored as Annotation_remark not hidden remark) [$mod_date]\n");
+            $log_object->_save_log('log_warning', '', $gsi, '', $tsi, 'VQCT_wrong_selC_coord', "seleno remark for $tsi stored as Annotation_remark not hidden remark) [$mod_date]");        
+            $annot_stops=$1;
+          }
+          elsif ($text =~ /^$alabel2(.*)/) {
+            my $maybe = $1;
+            if($maybe =~ /^\s*\d+(\s+\d+)*\s*$/) {
+              $annot_stops=$maybe;
+            } else {
+              $log_object->_save_log('log', '', $gene->stable_id, '', $tsi, '', "Maybe annotated stop in incorrect format, maybe just a remark that happens to begin '$alabel2'".
+                                                                                " -- might need to investigate: '$alabel2$maybe' [$mod_date]");
+            }
+          }
+        }
+      }
+
+      #check the location of the annotated edits matches actual stops in the sequence
+      my @annotated_stops;
+      if ($annot_stops){
+        my $i = 0;
+        foreach my $offset (split(/\s+/, $annot_stops)) {
+          #OK if it matches a known stop
+          if (
+            defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && ($found_stops[$i]->[1] == $offset)) {
+            push  @annotated_stops, $offset;
+          }
+          # catch old annotations where number was in DNA not peptide coordinates
+          elsif (defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && (($found_stops[$i]->[1] * 3) == $offset)) {
+            $log_object->_save_log('log_warning', '', $gene->stable_id, 'DNA', $tsi, 'VQCT_wrong_selC_coord', "DNA: Annotated stop for transcript tsi ($tname) is in DNA not peptide coordinates) [$mod_date]");
+          }
+          # catch old annotations where number off by one
+          elsif (defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && (($found_stops[$i]->[1]) == $offset+1)) {
+            $log_object->_save_log('log_warning', '', $gene->stable_id, 'PEPTIDE', $tsi, 'VQCT_wrong_selC_coord', "PEPTIDE: Annotated stop for transcript $tsi ($tname) is out by one) [$mod_date]");
+          }
+          elsif (defined($offset)  && ($offset=~/^\d+$/)){
+            if ($offset == length($orig_seq)+1) {
+              if($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq 'known-tga-stop') {
+                $log_object->_save_log('log', '', $gene->stable_id, 'TRANSCRIPT', $tsi, '', "Annotated stop for transcript $tsi ($tname) known to be a stop codon. Ok. [$mod_date]");
+              } elsif($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq 'known-terminal-sec') {
+                $log_object->_save_log('log', '', $gene->stable_id, 'TRANSCRIPT', $tsi, '', "Annotated stop for transcript $tsi ($tname) known to be a terminal Sec. Ok. [$mod_date]");
+              } else {
+                $log_object->_save_log('log_warning', '', $gene->stable_id, 'TRANSCRIPT', $tsi, '', "Annotated stop for transcript $tsi ($tname) \"$offset\" matches actual stop codon yet has no entry in script config to disambiguate it. Please investigate and add appropriate entry to config arrays in add_selcys.pl. [$mod_date]");
+              }
+            }
+            else {
+              $log_object->_save_log('log_warning', '', $gene->stable_id, 'TRANSCRIPT', $tsi, 'VQCT_wrong_selC_coord', "Annotated stop for transcript $tsi ($tname) \"$offset\" does not match a TGA codon) [$mod_date]");
+              push  @annotated_stops, $offset;
+            }
+          }						
+          $i++;
+        }
+      }
+
+      #check location of found stops matches annotated ones - any new ones are reported
+      foreach my $stop ( @found_stops ) {
+        my $pos = $stop->[1];
+        my $seq = $stop->[0];
+        unless ( grep { $pos == $_} @annotated_stops) {
+          if ($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq 'notsec-havana') {
+            #$support->log_verbose("Transcript $tsi ($tname) has potential selenocysteines but has been discounted by annotators:\n\t".$seen_transcripts->{$tsi}.") [$mod_date]\n");
+            $log_object->_save_log('log_verbose', '', $gene->stable_id, '', $tsi, 'VQCT_pot_selC', "Transcript $tsi ($tname) has potential selenocysteines but has been discounted by annotators: ".$seen_transcripts->{$tsi}->[0].") [$mod_date]");
+          }
+          else {
+            #$support->log("POTENTIAL SELENO ($seq) in $tsi ($tname, gene $gname) found at $pos [$mod_date]\n");
+            $log_object->_save_log('log', '', $gene->stable_id, '', $tsi, 'VQCT_pot_selC', "POTENTIAL SELENO ($seq) in $tsi ($tname, gene $gname) found at $pos [$mod_date]");
+          }
+        }
+      }
+    }
+  }
+}
+sub _save_log{
+  my $self=shift;
+  my $log_type = shift;
+  my $chrom_name=shift || '';
+  my $gsi=shift || '';
+  my $type=shift || '';  
+  my $tsi=shift || '';  
+  my $tag=shift || '';
+  my $txt=shift || '';
+  $self->$log_type($txt."\n");
+}
+
+#details of annotators comments
+__DATA__
+OTTHUMT00000144659 = FIXED- changed to transcript
+OTTHUMT00000276377 = FIXED- changed to transcript
+OTTHUMT00000257741 = FIXED- changed to nmd
+OTTHUMT00000155694 = NOT_FIXED- should be nmd but external annotation but cannot be fixed
+OTTHUMT00000155695 = NOT_FIXED- should be nmd but external annotation but cannot be fixed
+OTTHUMT00000282573 = FIXED- changed to unprocessed pseudogene
+OTTHUMT00000285227 = FIXED- changed start site
+OTTHUMT00000151008 = FIXED- incorrect trimming of CDS, removed extra stop codon
+OTTHUMT00000157999 = FIXED- changed incorrect stop
+OTTHUMT00000150523 = FIXED- incorrect trimming of CDS
+OTTHUMT00000150525 = FIXED- incorrect trimming of CDS
+OTTHUMT00000150522 = FIXED- incorrect trimming of CDS
+OTTHUMT00000150521 = FIXED- incorrect trimming of CDS
+OTTHUMT00000246819 = FIXED- corrected frame
+OTTHUMT00000314078 = FIXED- corrected frame
+OTTHUMT00000080133 = FIXED- corrected frame
+OTTHUMT00000286423 = FIXED- changed to transcript
+OTTMUST00000055509 = FIXED- error
+OTTMUST00000038729 = FIXED- corrected frame
+OTTMUST00000021760 = FIXED- corrected frame
+OTTMUST00000023057 = FIXED- corrected frame
+OTTMUST00000015207 = FIXED- corrected frame
+OTTMUST00000056646 = FIXED- error
+OTTMUST00000059686 = FIXED- corrected frame
+OTTMUST00000013426 = FIXED- corrected frame
+OTTMUST00000044412 = FIXED- error