Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/TranscriptFactory.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/TranscriptFactory.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,325 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +=head1 NAME + +Bio::EnsEMBL::TranscriptFactory - Module having the fset2transcript* +subroutines + +=head1 SYNOPSIS + + use Bio::EnsEMBL::TranscriptFactory; + + &Bio::EnsEMBL::TranscriptFactory::fset2transcript($fset_id); + +=head1 DESCRIPTION + +Module containing the subroutines fset2transcript*, +which create transcripts from features (formally housed in +Bio::EnsEMBL::DBSQL::Utils). + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::TranscriptFactory; + +use strict; + +use Bio::EnsEMBL::Exon; +use Bio::EnsEMBL::Translation; +use Bio::EnsEMBL::Transcript; + +sub fset2transcript { + my ($genscan,$contig)=@_; + + + unless ($genscan->isa ("Bio::EnsEMBL::SeqFeatureI")) + {print "$genscan must be Bio::EnsEMBL::SeqFeatureI\n";} + + my $transcript = new Bio::EnsEMBL::Transcript; + $transcript->temporary_id($contig->id . "." . $genscan->seqname); + + my @exons; + my $count= 1; + + foreach my $f ($genscan->sub_SeqFeature) { + + my $exon = new Bio::EnsEMBL::Exon; + $transcript->add_Exon($exon); + $exon->contig ($contig); + $exon->start ($f->start); + $exon->end ($f->end ); + $exon->strand ($f->strand); + $exon->phase ($f->phase); + $exon->end_phase( ($exon->phase + $exon->length)%3 ); + #$exon->score($f->score); +# $exon->p_value($f->p_value); + $exon->slice($contig->primary_seq); + + push(@exons,$exon); + $count++; + + } + + if( $count == 1 ) { + $genscan->throw("Got a 0 exon genscan"); + } + + my $translation = new Bio::EnsEMBL::Translation; + # + # This code got changed due to Translation convention changing. Should work... + # + + if ($exons[0]->strand == 1) { + @exons = sort {$a->start <=> $b->start} @exons; + } else { + @exons = sort {$b->start <=> $a->start} @exons; + } + + $translation->start(1); + $translation->end($exons[scalar(@exons)-1]->length); + + $translation->start_Exon($exons[0]); + $translation->end_Exon($exons[$#exons]); + + my $endphase = $exons[0]->end_phase; + + foreach my $exon (@exons) { + + if ( $exon == $exons[0] ){ + next; + } + $exon->phase ($endphase); + $endphase = $exon->end_phase; + } + + $transcript->translation($translation); + + return $transcript; +} + +sub fset2transcript_guess_phases { + my ($fset,$contig) = @_; + + my $transcript = new Bio::EnsEMBL::Transcript; + + $transcript->temporary_id($contig->id . "." . $fset->id); + + + my @exons; + my $count = 1; + + foreach my $f ($fset->sub_SeqFeature) { + + my $exon = new Bio::EnsEMBL::Exon; + $exon->contig ($contig); + $exon->start ($f->start); + $exon->end ($f->end ); + $exon->strand ($f->strand); + #$exon->score($f->score); +# $exon->p_value($f->p_value); + $exon->slice($contig); + $exon->phase($f->phase); + push(@exons,$exon); + $count++; + + } + + my $translation = new Bio::EnsEMBL::Translation; + + if ($exons[0]->strand == 1) { + @exons = sort {$a->start <=> $b->start} @exons; + } else { + @exons = sort {$b->start <=> $a->start} @exons; + } + + $translation->start (1); + $translation->end ($exons[$#exons]->end - $exons[$#exons]->start + 1); + $translation->start_Exon($exons[0]); + $translation->end_Exon($exons[$#exons]); + $transcript->translation($translation); + + my $endphase = 0; + + foreach my $exon (@exons) { + + $exon ->phase ($endphase); + $transcript->add_Exon($exon); + + $endphase = $exon->end_phase(($exon->phase + $exon->length)%3); + + } + + + if ($transcript->translate->seq !~ /\*/) { + return $transcript; + } + + $endphase = 1; + + foreach my $exon (@exons) { + $exon->phase($endphase); + $endphase = $exon->end_phase(($exon->phase + $exon->length)%3); + } + + if ($transcript->translate->seq !~ /\*/) { + return $transcript; + } + + $endphase = 2; + + foreach my $exon (@exons) { + $exon->phase($endphase); + $endphase = $exon->end_phase(($exon->phase + $exon->length)%3); + } + + if ($transcript->translate->seq !~ /\*/) { + return $transcript; + } +} + +sub fset2transcript_3frame { + my ($fset,$contig) = @_; + + my @f = $fset->sub_SeqFeature; + + if ($f[0]->strand == 1) { + @f = sort {$a->start <=> $b->start} @f; + } else { + @f = sort {$b->start <=> $a->start} @f; + } + + my @transcripts; + + my $startphase = 0; + + while ($startphase < 3) { + my $endphase = $startphase; + + my $transcript = new Bio::EnsEMBL::Transcript; + + push(@transcripts,$transcript); + + $transcript->temporary_id($contig->id . "." . $endphase); + + my $count = 1; + my @exons; + + + foreach my $f (@f) { + #print "exon seqname = ".$f->seqname."\n"; + my $exon = new Bio::EnsEMBL::Exon; + #print STDERR "exon ".$f->gffstring."\n"; + push(@exons,$exon); + $exon->seqname($f->seqname); + $exon->temporary_id ($contig->id . ".$count"); + $exon->contig ($contig); + $exon->start ($f->start); + $exon->end ($f->end ); + $exon->strand ($f->strand); + $exon->slice($contig); + $exon->phase ($endphase); + $exon->end_phase( ($exon->phase + $exon->length)%3 ); + #$exon->score ($f->score); +# $exon->p_value ($f->p_value); + $endphase = $exon->end_phase; + + $transcript->add_Exon($exon); + $count++; + + #print STDERR "Added exon start " . $exon->start . " end " . $exon->end . " strand " . $exon->strand . " score " . $exon->score . " pvalue " . $exon->p_value . "\n"; + } + + my $translation = new Bio::EnsEMBL::Translation; + + my $contig_id = ""; + my $fset_id = ""; + + if (defined($contig->id)) { + $contig_id = $contig->id; + } + if (defined($fset->id)) { + $fset_id = $fset->id; + } + + $translation->temporary_id($contig_id . "." . $fset_id); + $translation->start (1); + $translation->end ($exons[$#exons]->end - $exons[$#exons]->start + 1); + $translation->start_Exon($exons[0]); + $translation->end_Exon ($exons[$#exons]); + $transcript->translation($translation); + + # print STDERR "Phase $startphase " . $transcript->translate->seq . "\n"; + + $startphase++; + } + #print "finshed fset2transcript_3frame\n"; + return @transcripts; +} + + +sub fset2transcript_with_seq { + my ($genscan,$seq)=@_; + + + unless ($genscan->isa ("Bio::EnsEMBL::SeqFeatureI")) + {print "$genscan must be Bio::EnsEMBL::SeqFeatureI\n";} + unless ($seq->isa ("Bio::PrimarySeqI") || $seq->isa ("Bio::SeqI")) + {print "$seq must be Bio::SeqI or a Bio::PrimarySeqI\n";} + + #print STDERR "running fset2transcript\n"; + my $transcript = new Bio::EnsEMBL::Transcript; + $transcript->temporary_id($seq->id . "." . $genscan->seqname); + + my @exons; + my $count= 1; + + foreach my $f ($genscan->sub_SeqFeature) { + + my $exon = new Bio::EnsEMBL::Exon; + $exon->contig ($seq); + $exon->start ($f->start); + $exon->end ($f->end ); + $exon->strand ($f->strand); + $exon->phase ($f->phase); + $exon->end_phase( ($exon->phase + $exon->length)%3 ); + #$exon->score ($f->score); + #print STDERR "contig is a = ".$seq."\n"; + $exon->slice($seq); + + push(@exons,$exon); + $count++; + + } + + foreach my $exon (@exons) { + + $transcript->add_Exon($exon); + + + } + return $transcript; + +} + + + +1;