Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/Utils/VegaCuration/Translation.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/VegaCuration/Translation.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,447 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + This software is distributed under a modified Apache license. + For license details, please see + http://www.ensembl.org/info/about/code_licence.html +=head1 CONTACT + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut +=head1 NAME +=head1 SYNOPSIS +=head1 DESCRIPTION +=head1 METHODS +=cut + +package Bio::EnsEMBL::Utils::VegaCuration::Translation; +use strict; +use warnings; +use vars qw(@ISA); +use Data::Dumper; +use Bio::EnsEMBL::Utils::VegaCuration::Transcript; +@ISA = qw(Bio::EnsEMBL::Utils::VegaCuration::Transcript); + +=head2 check_CDS_start_end_remarks + + Args : B::E::Transcript + Example : my $results = $support->check_CDS_end_remarks($transcript) + Description: identifies incorrect 'CDS end...' transcript remarks in a + otter-derived Vega database + Returntype : hashref + +=cut + +sub check_CDS_start_end_remarks { + my $self = shift; + my $trans = shift; + # info for checking + my @remarks = @{$trans->get_all_Attributes('remark')}; + my $coding_end = $trans->cdna_coding_end; + my $coding_start = $trans->cdna_coding_start; + my $trans_end = $trans->length; + my $trans_seq = $trans->seq->seq; + my $stop_codon = substr($trans_seq, $coding_end-3, 3); + my $start_codon = substr($trans_seq, $coding_start-1, 3); + #hashref to return results + my $results; + + #extra CDS end not found remarks + if (grep {$_->value eq 'CDS end not found'} @remarks) { + if ( ($coding_end != $trans_end) + && ( grep {$_ eq $stop_codon} qw(TGA TAA TAG) ) ) { + $results->{'END_EXTRA'} = 1; + } + } + #missing CDS end not found remark + if ( $coding_end == $trans_end ) { + if (! grep {$_->value eq 'CDS end not found'} @remarks) { + if (grep {$_ eq $stop_codon} qw(TGA TAA TAG)) { + $results->{'END_MISSING_2'} = 1; + } + else { + $results->{'END_MISSING_1'} = $stop_codon; + } + } + } + #extra CDS start not found remark + if (grep {$_->value eq 'CDS start not found'} @remarks) { + if ( ($coding_start != 1) + && ($start_codon eq 'ATG') ) { + $results->{'START_EXTRA'} = 1; + } + } + #missing CDS start not found remark + if ( $coding_start == 1) { + if ( ! grep {$_->value eq 'CDS start not found'} @remarks) { + if ($start_codon eq 'ATG') { + $results->{'START_MISSING_2'} = 1; + } else { + $results->{'START_MISSING_1'} = $start_codon; + } + } + } + return $results; +} + +=head2 check_CDS_start_end_remarks_loutre + + Args : B::E::Transcript + Example : my $results = $support->check_CDS_end_remarks($transcript) + Description: identifies incorrect 'CDS end...' transcript attribs in a loutre + of a loutre-derived Vega database. + Returntype : hashref + +=cut + +sub check_CDS_start_end_remarks_loutre { + my $self = shift; + my $trans = shift; + + # info for checking + my @stops = qw(TGA TAA TAG); + my %attributes; + foreach my $attribute (@{$trans->get_all_Attributes()}) { + push @{$attributes{$attribute->code}}, $attribute; + } +# warn $trans->stable_id; +# warn Data::Dumper::Dumper(\%attributes); + my $coding_end = $trans->cdna_coding_end; + my $coding_start = $trans->cdna_coding_start; + my $trans_end = $trans->length; + my $trans_seq = $trans->seq->seq; + my $stop_codon_offset = 3 + $trans->translation->end_Exon->end_phase; + my $initial_exon_phase = $trans->translation->start_Exon->phase; + my $stop_codon = substr($trans_seq, $coding_end-3, 3); + my $start_codon = substr($trans_seq, $coding_start-1, 3); + + my $start_codon_incorrect = 1; + if ($start_codon eq 'ATG' ) { + $start_codon_incorrect = 0; + } + elsif ($start_codon eq 'CTG') { + foreach my $attrib (@{$attributes{'remark'}}) { + if ($attrib->value =~ /non[- ]ATG start/) { + $start_codon_incorrect = 0; + } + } + } + +# warn "$start_codon -- $initial_exon_phase -- $coding_start -- $start_codon_incorrect"; + + #hashref to return results + my $results; + + #extra CDS end not found remarks + if ($attributes{'cds_end_NF'}) { + if ( ($attributes{'cds_end_NF'}->[0]->value == 1) + && ($coding_end != $trans_end) + && ( grep {$_ eq $stop_codon} @stops) ) { +# warn $trans->stable_id.": $coding_end--$trans_end--$stop_codon"; +# warn $trans->translation->end_Exon->end_phase; + $results->{'END_EXTRA'} = $stop_codon; + } + } + #missing CDS end not found remark + if ( $coding_end == $trans_end ) { + if ($attributes{'cds_end_NF'}) { + if ($attributes{'cds_end_NF'}->[0]->value == 0 ) { + if (! grep {$_ eq $stop_codon} @stops) { +# warn $trans->translation->end_Exon->end_phase; + $results->{'END_MISSING'}{'WRONG'} = $stop_codon; + } + } + } + elsif (! grep {$_ eq $stop_codon} @stops) { + $results->{'END_MISSING'}{'ABSENT'} = $stop_codon; + } + } + #extra CDS start not found remark + if ( $attributes{'cds_start_NF'}) { + if ( ($attributes{'cds_start_NF'}->[0]->value == 1 ) + && (! $start_codon_incorrect)) { + unless ( ($coding_start == 1) && ( $initial_exon_phase > 0)) { + $results->{'START_EXTRA'} = $start_codon; + } + } + } + #missing CDS start not found remark + if ( $coding_start == 1) { + if ( $attributes{'cds_start_NF'} ) { + if ( $attributes{'cds_start_NF'}->[0]->value == 0 ) { + if ($start_codon_incorrect) { + $results->{'START_MISSING'}{'ABSENT'} = $start_codon; + } + elsif ($initial_exon_phase > 0) { + $results->{'START_MISSING'}{'INITIAL_PHASE'} = $initial_exon_phase; + } + } + } + elsif ($start_codon ne 'ATG') { + $results->{'START_MISSING'}{'ABSENT'} = $start_codon; + } + + } + return $results; +} + +=head2 get_havana_seleno_comments + + Args : none + Example : my $results = $support->get_havana_seleno_comments + Description: parses the HEREDOC containing Havana comments in this module + Returntype : hashref + +=cut + +sub get_havana_seleno_comments { + my $seen_translations; + while (<DATA>) { + next if /^\s+$/ or /#+/; + my ($obj,$comment) = split /=/; + $obj =~ s/^\s+|\s+$//g; + $comment =~ s/^\s+|\s+$//g; + # We add the origin as now "seen" can come from a number of places, and have + # a number of consequences in different cases, not just discounted Secs from this method. -- ds23 + $seen_translations->{$obj} = [ $comment,"notsec-havana" ]; + } + return $seen_translations; +} + +sub check_for_stops { + my $support = shift; + my ($gene,$seen_transcripts,$log_object) = @_; + my $transcripts; + my $has_log_object=defined($log_object); + if($has_log_object){ + my @help = $log_object->species_params->get_trans($gene->stable_id); + $transcripts=\@help; + }else{ + $log_object=$support; + $transcripts=$gene->get_all_Transcripts; + } + + my $gname = $gene->get_all_Attributes('name')->[0]->value; + my $gsi = $gene->stable_id; + my $scodon = 'TGA'; + my $mod_date = $support->date_format( $gene->modified_date,'%d/%m/%y' ); + my $hidden_remak_ttributes; + TRANS: + foreach my $trans (@$transcripts) { + my $tsi = $trans->stable_id; + my $tID = $trans->dbID; + my $tname = $trans->get_all_Attributes('name')->[0]->value; + if($has_log_object){ + $hidden_remak_ttributes=$log_object->species_params->get_attributes->{$tsi}->{'hidden_remark'}; + }else{ + $hidden_remak_ttributes=$trans->get_all_Attributes('hidden_remark'); + } + foreach my $rem (@$hidden_remak_ttributes) { + if ($rem->value =~ /not_for_Vega/) { + #$support->log_verbose("Skipping transcript $tname ($tsi) since 'not_for_Vega'\n",1); + $log_object->_save_log('log_verbose', '', $gsi, '', $tsi, '', "Skipping transcript $tname ($tsi) since 'not_for_Vega'"); + next TRANS; + } + } + + # go no further if there is a ribosomal framshift attribute + foreach my $attrib (@{$trans->get_all_Attributes('_rib_frameshift')}) { + if ($attrib->value) { + $log_object->_save_log('log', '', $gsi, '', $tsi, '', "Skipping $tsi ($tname) since it has a ribosomal frameshift attribute"); + next TRANS; + } + } + + #$support->log_verbose("Studying transcript $tsi ($tname, $tID)\n",1); + $log_object->_save_log('log_verbose', '', $gsi, '', $tsi, '', "Studying transcript $tsi ($tname, $tID)"); + my $peptide; + + # go no further if the transcript doesn't translate or if there are no stops + next TRANS unless ($peptide = $trans->translate); + my $pseq = $peptide->seq; + my $orig_seq = $pseq; + # (translate method trims stops from sequence end) + + next TRANS unless ($pseq =~ /\*/); + # warn sprintf("Stop codon is '%s'\n",substr($trans->translateable_seq,-3)); + #$support->log_verbose("Stops found in $tsi ($tname)\n",1); + $log_object->_save_log('log_verbose', '', $gsi, '', $tsi, '', "Stops found in $tsi ($tname)"); + + # find out where and how many stops there are + my @found_stops; + my $mrna = $trans->translateable_seq; + my $offset = 0; + my $tstop; + while ($pseq =~ /^([^\*]*)\*(.*)/) { + my $pseq1_f = $1; + $pseq = $2; + my $seq_flag = 0; + $offset += length($pseq1_f) * 3; + my $stop = substr($mrna, $offset, 3); + my $aaoffset = int($offset / 3)+1; + push(@found_stops, [ $stop, $aaoffset ]); + $tstop .= "$aaoffset "; + $offset += 3; + } + + # are all stops TGA...? + my $num_stops = scalar(@found_stops); + my $num_tga = 0; + my $positions; + foreach my $stop (@found_stops) { + $positions .= $stop->[0]."(".$stop->[1].") "; + if ($stop->[0] eq $scodon) { + $num_tga++; + } + } + my $source = $gene->source; + #...no - an internal stop codon error in the database... + if ($num_tga < $num_stops) { + if ($source eq 'havana') { + #$support->log_warning("INTERNAL STOPS HAVANA: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons [$mod_date]:\nSequence = $orig_seq\nStops at $positions)\n\n"); + $log_object->_save_log('log_warning', '', $gsi, 'TRANSCRIPT', $tsi, 'VQCT_internal_stop', "INTERNAL STOPS HAVANA: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons [$mod_date]: Sequence = $orig_seq Stops at $positions)"); + } + else { + #$support->log_warning("INTERNAL STOPS EXTERNAL: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons[$mod_date]:\nSequence = $orig_seq\nStops at $positions)\n\n"); + $log_object->_save_log('log_warning', '', $gsi, 'TRANSCRIPT', $tsi, 'VQCT_internal_stop', "INTERNAL STOPS EXTERNAL: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons[$mod_date]: Sequence = $orig_seq Stops at $positions)"); + } + } + #...yes - check remarks + else { + my $flag_remark = 0; # 1 if word seleno has been used + my $flag_remark2 = 0; # 1 if existing remark has correct numbering + my $alabel = 'Annotation_remark- selenocysteine '; + my $alabel2 = 'selenocysteine '; + my $annot_stops; + my $remarks; + my $att; + #get both hidden_remarks and remarks + foreach my $remark_type ('remark','hidden_remark') { + if($has_log_object){ + $att=$log_object->species_params->get_attributes->{$trans->stable_id}->{$remark_type}; + }else{ + $att=$trans->get_all_Attributes($remark_type) + } + foreach my $attrib ( @$att) { + push @{$remarks->{$remark_type}}, $attrib->value; + } + } + #parse remarks to check syntax for location of edits + while (my ($attrib,$remarks)= each %$remarks) { + foreach my $text (@{$remarks}) { + if ( ($attrib eq 'remark') && ($text=~/^$alabel(.*)/) ){ + #$support->log_warning("seleno remark for $tsi stored as Annotation_remark not hidden remark) [$mod_date]\n"); + $log_object->_save_log('log_warning', '', $gsi, '', $tsi, 'VQCT_wrong_selC_coord', "seleno remark for $tsi stored as Annotation_remark not hidden remark) [$mod_date]"); + $annot_stops=$1; + } + elsif ($text =~ /^$alabel2(.*)/) { + my $maybe = $1; + if($maybe =~ /^\s*\d+(\s+\d+)*\s*$/) { + $annot_stops=$maybe; + } else { + $log_object->_save_log('log', '', $gene->stable_id, '', $tsi, '', "Maybe annotated stop in incorrect format, maybe just a remark that happens to begin '$alabel2'". + " -- might need to investigate: '$alabel2$maybe' [$mod_date]"); + } + } + } + } + + #check the location of the annotated edits matches actual stops in the sequence + my @annotated_stops; + if ($annot_stops){ + my $i = 0; + foreach my $offset (split(/\s+/, $annot_stops)) { + #OK if it matches a known stop + if ( + defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && ($found_stops[$i]->[1] == $offset)) { + push @annotated_stops, $offset; + } + # catch old annotations where number was in DNA not peptide coordinates + elsif (defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && (($found_stops[$i]->[1] * 3) == $offset)) { + $log_object->_save_log('log_warning', '', $gene->stable_id, 'DNA', $tsi, 'VQCT_wrong_selC_coord', "DNA: Annotated stop for transcript tsi ($tname) is in DNA not peptide coordinates) [$mod_date]"); + } + # catch old annotations where number off by one + elsif (defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && (($found_stops[$i]->[1]) == $offset+1)) { + $log_object->_save_log('log_warning', '', $gene->stable_id, 'PEPTIDE', $tsi, 'VQCT_wrong_selC_coord', "PEPTIDE: Annotated stop for transcript $tsi ($tname) is out by one) [$mod_date]"); + } + elsif (defined($offset) && ($offset=~/^\d+$/)){ + if ($offset == length($orig_seq)+1) { + if($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq 'known-tga-stop') { + $log_object->_save_log('log', '', $gene->stable_id, 'TRANSCRIPT', $tsi, '', "Annotated stop for transcript $tsi ($tname) known to be a stop codon. Ok. [$mod_date]"); + } elsif($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq 'known-terminal-sec') { + $log_object->_save_log('log', '', $gene->stable_id, 'TRANSCRIPT', $tsi, '', "Annotated stop for transcript $tsi ($tname) known to be a terminal Sec. Ok. [$mod_date]"); + } else { + $log_object->_save_log('log_warning', '', $gene->stable_id, 'TRANSCRIPT', $tsi, '', "Annotated stop for transcript $tsi ($tname) \"$offset\" matches actual stop codon yet has no entry in script config to disambiguate it. Please investigate and add appropriate entry to config arrays in add_selcys.pl. [$mod_date]"); + } + } + else { + $log_object->_save_log('log_warning', '', $gene->stable_id, 'TRANSCRIPT', $tsi, 'VQCT_wrong_selC_coord', "Annotated stop for transcript $tsi ($tname) \"$offset\" does not match a TGA codon) [$mod_date]"); + push @annotated_stops, $offset; + } + } + $i++; + } + } + + #check location of found stops matches annotated ones - any new ones are reported + foreach my $stop ( @found_stops ) { + my $pos = $stop->[1]; + my $seq = $stop->[0]; + unless ( grep { $pos == $_} @annotated_stops) { + if ($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq 'notsec-havana') { + #$support->log_verbose("Transcript $tsi ($tname) has potential selenocysteines but has been discounted by annotators:\n\t".$seen_transcripts->{$tsi}.") [$mod_date]\n"); + $log_object->_save_log('log_verbose', '', $gene->stable_id, '', $tsi, 'VQCT_pot_selC', "Transcript $tsi ($tname) has potential selenocysteines but has been discounted by annotators: ".$seen_transcripts->{$tsi}->[0].") [$mod_date]"); + } + else { + #$support->log("POTENTIAL SELENO ($seq) in $tsi ($tname, gene $gname) found at $pos [$mod_date]\n"); + $log_object->_save_log('log', '', $gene->stable_id, '', $tsi, 'VQCT_pot_selC', "POTENTIAL SELENO ($seq) in $tsi ($tname, gene $gname) found at $pos [$mod_date]"); + } + } + } + } + } +} +sub _save_log{ + my $self=shift; + my $log_type = shift; + my $chrom_name=shift || ''; + my $gsi=shift || ''; + my $type=shift || ''; + my $tsi=shift || ''; + my $tag=shift || ''; + my $txt=shift || ''; + $self->$log_type($txt."\n"); +} + +#details of annotators comments +__DATA__ +OTTHUMT00000144659 = FIXED- changed to transcript +OTTHUMT00000276377 = FIXED- changed to transcript +OTTHUMT00000257741 = FIXED- changed to nmd +OTTHUMT00000155694 = NOT_FIXED- should be nmd but external annotation but cannot be fixed +OTTHUMT00000155695 = NOT_FIXED- should be nmd but external annotation but cannot be fixed +OTTHUMT00000282573 = FIXED- changed to unprocessed pseudogene +OTTHUMT00000285227 = FIXED- changed start site +OTTHUMT00000151008 = FIXED- incorrect trimming of CDS, removed extra stop codon +OTTHUMT00000157999 = FIXED- changed incorrect stop +OTTHUMT00000150523 = FIXED- incorrect trimming of CDS +OTTHUMT00000150525 = FIXED- incorrect trimming of CDS +OTTHUMT00000150522 = FIXED- incorrect trimming of CDS +OTTHUMT00000150521 = FIXED- incorrect trimming of CDS +OTTHUMT00000246819 = FIXED- corrected frame +OTTHUMT00000314078 = FIXED- corrected frame +OTTHUMT00000080133 = FIXED- corrected frame +OTTHUMT00000286423 = FIXED- changed to transcript +OTTMUST00000055509 = FIXED- error +OTTMUST00000038729 = FIXED- corrected frame +OTTMUST00000021760 = FIXED- corrected frame +OTTMUST00000023057 = FIXED- corrected frame +OTTMUST00000015207 = FIXED- corrected frame +OTTMUST00000056646 = FIXED- error +OTTMUST00000059686 = FIXED- corrected frame +OTTMUST00000013426 = FIXED- corrected frame +OTTMUST00000044412 = FIXED- error
