Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/Utils/SeqDumper.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/SeqDumper.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1121 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::SeqDumper + +=head1 SYNOPSIS + + $seq_dumper = Bio::EnsEMBL::Utils::SeqDumper->new(); + + # don't dump snps or repeats + $seq_dumper->disable_feature_type('repeat'); + $seq_dumper->disable_feature_type('variation'); + + # dump EMBL format to STDOUT + $seq_dumper->dump( $slice, 'EMBL' ); + + # dump GENBANK format to a file + $seq_dumper->dump( $slice, 'GENBANK', 'out.genbank' ); + + # dump FASTA format to a file + $seq_dumper->dump( $slice, 'FASTA', 'out.fasta' ); + +=head1 DESCRIPTION + +A relatively simple and lite-weight flat file dumper for Ensembl slices. +The memory efficiency could be improved and this is currently not very +good for dumping very large sequences such as whole chromosomes. + +=head1 METHODS + +=cut + + +package Bio::EnsEMBL::Utils::SeqDumper; + +use strict; + +use IO::File; +use vars qw(@ISA); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + +#keys must be uppercase +my $DUMP_HANDLERS = + { 'FASTA' => \&dump_fasta, + 'EMBL' => \&dump_embl, + 'GENBANK' => \&dump_genbank }; + +my @COMMENTS = + ('This sequence was annotated by the Ensembl system. Please visit ' . + 'the Ensembl web site, http://www.ensembl.org/ for more information.', + + 'All feature locations are relative to the first (5\') base ' . + 'of the sequence in this file. The sequence presented is '. + 'always the forward strand of the assembly. Features ' . + 'that lie outside of the sequence contained in this file ' . + 'have clonal location coordinates in the format: ' . + '<clone accession>.<version>:<start>..<end>', + + 'The /gene indicates a unique id for a gene, /note="transcript_id=..."' . + ' a unique id for a transcript, /protein_id a unique id for a peptide ' . + 'and note="exon_id=..." a unique id for an exon. These ids are ' . + 'maintained wherever possible between versions.', + + 'All the exons and transcripts in Ensembl are confirmed by ' . + 'similarity to either protein or cDNA sequences.'); + + +=head2 new + + Arg [1] : none + Example : $seq_dumper = Bio::EnsEMBL::Utils::SeqDumper->new; + Description: Creates a new SeqDumper + Returntype : Bio::EnsEMBL::Utils::SeqDumper + Exceptions : none + Caller : general + +=cut + +sub new { + my ($caller, $slice, $params) = @_; + + my $class = ref($caller) || $caller; + + my $feature_types = {'gene' => 1, + 'genscan' => 1, + 'repeat' => 1, + 'similarity' => 1, + 'variation' => 1, + 'contig' => 1, + 'marker' => 1, + 'estgene' => 0, + 'vegagene' => 0}; + + my $self = bless {'feature_types' => $feature_types}, $class; + + foreach my $p (sort keys %{$params || {}}) { + $self->{$p} = $params->{$p}; + } + + return $self; +} + + + +=head2 enable_feature_type + + Arg [1] : string $type + Example : $seq_dumper->enable_feature_type('similarity'); + Description: Enables the dumping of a specific type of feature + Returntype : none + Exceptions : warn if invalid feature type is passed, + thrown if no feature type is passed + Caller : general + +=cut + +sub enable_feature_type { + my ($self, $type) = @_; + + $type || throw("type arg is required"); + + if(exists($self->{'feature_types'}->{$type})) { + $self->{'feature_types'}->{$type} = 1; + } else { + warning("unknown feature type '$type'\n" . + "valid types are: " . join(',', keys %{$self->{'feature_types'}})); + } +} + + + +=head2 attach_database + + Arg [1] : string name + Arg [2] : Bio::EnsEMBL::DBSQL::DBAdaptor + Example : $seq_dumper->attach_database('estgene', $estgene_db); + Description: Attaches a database to the seqdumper that can be used to + dump data which is external to the ensembl core database. + Currently this is necessary to dump est genes and vega genes + Returntype : none + Exceptions : thrown if incorrect argument is supplied + Caller : general + +=cut + +sub attach_database { + my ($self, $name, $db) = @_; + + $name || throw("name arg is required"); + unless($db && ref($db) && $db->isa('Bio::EnsEMBL::DBSQL::DBAdaptor')) { + throw("db arg must be a Bio::EnsEMBL::DBSQL::DBAdaptor not a [$db]"); + } + + $self->{'attached_dbs'}->{$name} = $db; +} + + + +=head2 get_database + + Arg [1] : string $name + Example : $db = $seq_dumper->get_database('vega'); + Description: Retrieves a database that has been attached to the + seqdumper via the attach database call. + Returntype : Bio::EnsEMBL::DBSQL::DBAdaptor + Exceptions : thrown if incorrect argument is supplied + Caller : dump_feature_table + +=cut + +sub get_database { + my ($self, $name) = @_; + + $name || throw("name arg is required"); + + return $self->{'attached_dbs'}->{$name}; +} + + + +=head2 remove_database + + Arg [1] : string $name + Example : $db = $seq_dumper->remove_database('estgene'); + Description: Removes a database that has been attached to the seqdumper + via the attach database call. The database that is removed + is returned (or undef if it did not exist). + Returntype : Bio::EnsEMBL::DBSQL::DBAdaptor + Exceptions : thrown if incorrect argument is supplied + Caller : general + +=cut + +sub remove_database { + my ($self, $name) = @_; + + $name || throw("name arg is required"); + + if(exists $self->{'attached_dbs'}->{$name}) { + return delete $self->{'attached_dbs'}->{$name}; + } + + return undef; +} + + +=head2 disable_feature_type + + Arg [1] : string $type + Example : $seq_dumper->disable_feature_type('genes'); + Description: Disables the dumping of a specific type of feature + Returntype : none + Exceptions : warn if an invalid feature type is passed, + thrown if no feature type is passed + Caller : general + +=cut + +sub disable_feature_type { + my ($self, $type) = @_; + + $type || throw("type arg is required"); + + if(exists($self->{'feature_types'}->{$type})) { + $self->{'feature_types'}->{$type} = 0; + } else { + warning("unknown feature type '$type'\n" . + "valid types are: " . join(',', keys %{$self->{'feature_types'}})); + } +} + + + +=head2 is_enabled + + Arg [1] : string $type + Example : do_something() if($seq_dumper->is_enabled('gene')); + Description: checks if a specific feature type is enabled + Returntype : none + Exceptions : warning if invalid type is passed, + thrown if no type is passed + Caller : general + +=cut + +sub is_enabled { + my ($self, $type) = @_; + + $type || throw("type arg is required"); + + if(exists($self->{'feature_types'}->{$type})) { + return $self->{'feature_types'}->{$type}; + } else { + warning("unknown feature type '$type'\n" . + "valid types are: " . join(',', keys %{$self->{'feature_types'}})); + } +} + + +=head2 dump + + Arg [1] : Bio::EnsEMBL::Slice slice + The slice to dump + Arg [2] : string $format + The name of the format to dump + Arg [3] : (optional) $outfile + The name of the file to dump to. If no file is specified STDOUT + is used + Arg [4] : (optional) $seq + Sequence to dump + Arg [4] : (optional) $no_append + Default action is to open the file in append mode. This will + turn that mode off + Example : $seq_dumper->dump($slice, 'EMBL'); + Description: Dumps a region of a genome specified by the slice argument into + an outfile of the format $format + Returntype : none + Exceptions : thrown if slice or format args are not supplied + Caller : general + +=cut + + +sub dump { + my ($self, $slice, $format, $outfile, $seq, $no_append) = @_; + + $format || throw("format arg is required"); + $slice || throw("slice arg is required"); + + my $dump_handler = $DUMP_HANDLERS->{uc($format)}; + + unless($dump_handler) { + throw("No dump handler is defined for format $format\n"); + } + + + my $FH = IO::File->new;; + if($outfile) { + my $mode = ($no_append) ? '>' : '>>'; + $FH->open("${mode}${outfile}") or throw("Could not open file $outfile: $!"); + } else { + $FH = \*STDOUT; + #mod_perl did not like the following + #$FH->fdopen(fileno(STDOUT), "w") + # or throw("Could not open currently selected output filehandle " . + # "for writing"); + } + + &$dump_handler($self, $slice, $FH, $seq); + + $FH->close if ($outfile); #close if we were writing to a file +} + +=head2 dump_embl + + Arg [1] : Bio::EnsEMBL::Slice + Arg [2] : IO::File $FH + Arg [3] : optional sequence string + Example : $seq_dumper->dump_embl($slice, $FH); + Description: Dumps an EMBL flat file to an open file handle + Returntype : none + Exceptions : none + Caller : dump + +=cut + +sub dump_embl { + my $self = shift; + my $slice = shift; + my $FH = shift; + my $SEQ = shift; + + my $len = $slice->length; + + my $version; + my $acc; + + my $cs = $slice->coord_system(); + my $name_str = $cs->name() . ' ' . $slice->seq_region_name(); + $name_str .= ' ' . $cs->version if($cs->version); + + my $start = $slice->start; + my $end = $slice->end; + + #determine if this slice is the entire seq region + #if it is then we just use the name as the id + my $slice_adaptor = $slice->adaptor(); + my $full_slice = + $slice->adaptor->fetch_by_region($cs->name, + $slice->seq_region_name, + undef,undef,undef, + $cs->version); + + + my $entry_name = $slice->seq_region_name(); + + + + if($full_slice->name eq $slice->name) { + $name_str .= ' full sequence'; + $acc = $slice->seq_region_name(); + my @acc_ver = split(/\./, $acc); + if(@acc_ver == 2) { + $acc = $acc_ver[0]; + $version = $acc_ver[0] . '.'. $acc_ver[1]; + } elsif(@acc_ver == 1 && $cs->version()) { + $version = $acc . '.'. $cs->version(); + } else { + $version = $acc; + } + } else { + $name_str .= ' partial sequence'; + $acc = $slice->name(); + $version = $acc; + } + + $acc = $slice->name(); + + + + #line breaks are allowed near the end of the line on ' ', "\t", "\n", ',' + $: = (" \t\n-,"); + + ############# + # dump header + ############# + + my $EMBL_HEADER = +'@< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~ +'; + + #ID and moltype + # HTG = High Throughput Genome division, probably most suitable + # and it would be hard to come up with another appropriate division + # that worked for all organisms (e.g. plants are in PLN but human is + # in HUM). + my $VALUE = "$entry_name standard; DNA; HTG; $len BP."; + $self->write($FH, $EMBL_HEADER, 'ID', $VALUE); + $self->print( $FH, "XX\n" ); + + #Accession + $self->write($FH, $EMBL_HEADER, 'AC', $acc); + $self->print( $FH, "XX\n" ); + + #Version + $self->write($FH, $EMBL_HEADER, 'SV', $version); + $self->print( $FH, "XX\n" ); + + #Date + $self->write($FH, $EMBL_HEADER, 'DT', $self->_date_string); + $self->print( $FH, "XX\n" ); + + my $meta_container = $slice->adaptor()->db()->get_MetaContainer(); + + #Description + $self->write($FH, $EMBL_HEADER, 'DE', $meta_container->get_scientific_name() . + " $name_str $start..$end annotated by Ensembl"); + $self->print( $FH, "XX\n" ); + + #key words + $self->write($FH, $EMBL_HEADER, 'KW', '.'); + $self->print( $FH, "XX\n" ); + + #Species + my $species_name = $meta_container->get_scientific_name(); + if(my $cn = $meta_container->get_common_name()) { + $species_name .= " ($cn)"; + } + + $self->write($FH, $EMBL_HEADER, 'OS', $species_name); + + #Classification + my $cls = $meta_container->get_classification(); + $self->write($FH, $EMBL_HEADER, 'OC', join('; ', reverse(@{$cls})) . '.'); + $self->print( $FH, "XX\n" ); + + #References (we are not dumping refereneces) + + #Database References (we are not dumping these) + + #comments + foreach my $comment (@COMMENTS) { + $self->write($FH, $EMBL_HEADER, 'CC', $comment); + $self->print( $FH, "XX\n" ); + } + + #################### + #DUMP FEATURE TABLE + #################### + $self->print( $FH, "FH Key Location/Qualifiers\n" ); + + my $FEATURE_TABLE = +'FT ^<<<<<<<<<<<<<<<^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~ +'; + $self->_dump_feature_table($slice, $FH, $FEATURE_TABLE); + + #write an XX after the feature tables + $self->print( $FH, "XX\n" ); + + ################### + #DUMP SEQUENCE + ################### + + if(!defined($SEQ)){ + $SEQ = $slice->seq(); + } +# my $SEQ = $slice->seq(); + my $length = length($SEQ); + my $a_count = $SEQ =~ tr/aA/aA/; + my $c_count = $SEQ =~ tr/cC/cC/; + my $t_count = $SEQ =~ tr/tT/tT/; + my $g_count = $SEQ =~ tr/gG/gG/; + my $other_count = $length - $a_count - $c_count - $t_count - $g_count; + + my $value = "Sequence $length BP; $a_count A; $c_count C; " . + "$g_count G; $t_count T; $other_count other;"; + $self->print($FH, 'SQ '.$value."\n"); + + $self->write_embl_seq($FH, \$SEQ); + + + $self->print( $FH, "//\n" ); + + # Set formatting back to normal + $: = " \n-"; +} + + + + +=head2 dump_genbank + + Arg [1] : Bio::EnsEMBL::Slice + Arg [2] : IO::File $FH + Example : $seq_dumper->dump_genbank($slice, $FH); + Description: Dumps a GENBANK flat file to an open file handle + Returntype : none + Exceptions : none + Caller : dump + +=cut + +sub dump_genbank { + my ($self, $slice, $FH, $SEQ) = @_; + + #line breaks are allowed near the end of the line on ' ', "\t", "\n", ',' + $: = " \t\n-,"; + + my $GENBANK_HEADER = +'^<<<<<<<<< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~ +'; + + my $GENBANK_SUBHEADER = +' ^<<<<<<< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~ +'; + + my $GENBANK_FT = +' ^<<<<<<<<<<<<<< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~ +'; + + my $version; + my $acc; + + my $cs = $slice->coord_system(); + + my $name_str = $cs->name() . ' ' . $slice->seq_region_name(); + + $name_str .= ' ' . $cs->version if($cs->version); + + #determine if this slice is the entire seq region + #if it is then we just use the name as the id + my $slice_adaptor = $slice->adaptor(); + my $full_slice = + $slice->adaptor->fetch_by_region($cs->name, + $slice->seq_region_name, + undef,undef,undef, + $cs->version); + + + my $entry_name = $slice->seq_region_name(); + + if($full_slice->name eq $slice->name) { + $name_str .= ' full sequence'; + $acc = $slice->seq_region_name(); + my @acc_ver = split(/\./, $acc); + if(@acc_ver == 2) { + $acc = $acc_ver[0]; + $version = $acc_ver[0] . $acc_ver[1]; + } elsif(@acc_ver == 1 && $cs->version()) { + $version = $acc . $cs->version(); + } else { + $version = $acc; + } + } else { + $name_str .= ' partial sequence'; + $acc = $slice->name(); + $version = $acc; + } + + $acc = $slice->name(); # to keep format consistent for all + + my $length = $slice->length; + my $start = $slice->start(); + my $end = $slice->end(); + + my $date = $self->_date_string; + + my $meta_container = $slice->adaptor()->db()->get_MetaContainer(); + + #LOCUS + my $tag = 'LOCUS'; + my $value = "$entry_name $length bp DNA HTG $date"; + $self->write($FH, $GENBANK_HEADER, $tag, $value); + + #DEFINITION + $tag = "DEFINITION"; + $value = $meta_container->get_scientific_name() . + " $name_str $start..$end reannotated via EnsEMBL"; + $self->write($FH, $GENBANK_HEADER, $tag, $value); + + #ACCESSION + $self->write($FH, $GENBANK_HEADER, 'ACCESSION', $acc); + + #VERSION + $self->write($FH, $GENBANK_HEADER, 'VERSION', $version); + + # KEYWORDS + $self->write($FH, $GENBANK_HEADER, 'KEYWORDS', '.'); + + # SOURCE + my $common_name = $meta_container->get_common_name(); + $common_name = $meta_container->get_scientific_name() unless $common_name; + $self->write($FH, $GENBANK_HEADER, 'SOURCE', $common_name); + + #organism + my @cls = $meta_container->get_classification(); + shift @cls; + $self->write($FH, $GENBANK_SUBHEADER, 'ORGANISM', $meta_container->get_scientific_name()); + $self->write($FH, $GENBANK_SUBHEADER, '', join('; ', reverse @cls) . "."); + + #refereneces + + #comments + foreach my $comment (@COMMENTS) { + $self->write($FH, $GENBANK_HEADER, 'COMMENT', $comment); + } + + #################### + # DUMP FEATURE TABLE + #################### + $self->print( $FH, "FEATURES Location/Qualifiers\n" ); + $self->_dump_feature_table($slice, $FH, $GENBANK_FT); + + #################### + # DUMP SEQUENCE + #################### + + if(!defined($SEQ)){ + $SEQ = $slice->seq(); + } +# my $SEQ = $slice->seq(); + my $a_count = $SEQ =~ tr/aA/aA/; + my $c_count = $SEQ =~ tr/cC/cC/; + my $t_count = $SEQ =~ tr/tT/tT/; + my $g_count = $SEQ =~ tr/gG/gG/; + my $bp_length = length($SEQ); + my $other_count = $bp_length - $a_count - $c_count - $t_count - $g_count; + + $tag = 'BASE COUNT'; + $value = "$a_count a $c_count c $g_count g $t_count t"; + $value .= " $other_count n" if($other_count); + $self->print($FH, qq{$tag $value\n}); + $self->print( $FH, "ORIGIN\n" ); + + $self->write_genbank_seq($FH, \$SEQ); + + $self->print( $FH, "//\n" ); + + # Set formatting back to normal + $: = " \n-"; +} + + + +=head2 _dump_feature_table + + Arg [1] : Bio::EnsEMBL::Slice slice + Example : none + Description: Helper method used to dump feature tables used in EMBL, FASTA, + GENBANK. Assumes formating of file handle has been setup + already to use $FEAT and $VALUE values. + Returntype : none + Exceptions : none + Caller : internal + +=cut + +sub _dump_feature_table { + my $self = shift; + my $slice = shift; + my $FH = shift; + my $FORMAT = shift; + + #use only the core database to dump features (except for bloody snps) + my $lite = $slice->adaptor->db->remove_db_adaptor('lite'); + + my $meta = $slice->adaptor->db->get_MetaContainer; + + #lump file handle and format string together for simpler method calls + my @ff = ($FH, $FORMAT); + my $value; + + #source + my $classification = join(', ', $meta->get_classification()); + $self->write(@ff,'source', "1.." . $slice->length()); + $self->write(@ff,'' , '/organism="'.$meta->get_scientific_name(). '"'); + $self->write(@ff,'' , '/db_xref="taxon:'.$meta->get_taxonomy_id().'"'); + + # + # Transcripts & Genes + # + my @gene_slices; + if($self->is_enabled('gene')) { + push @gene_slices, $slice; + } + + # Retrieve slices of other database where we need to pull genes from + + my $gene_dbs = {'vegagene' => 'vega', + 'estgene' => 'estgene'}; + + foreach my $gene_type (keys %$gene_dbs) { + if($self->is_enabled($gene_type)) { + my $db = $self->get_database($gene_dbs->{$gene_type}); + if($db) { + my $sa = $db->get_SliceAdaptor(); + push @gene_slices, $sa->fetch_by_name($slice->name()); + } else { + warning("A [". $gene_dbs->{$gene_type} ."] database must be " . + "attached to this SeqDumper\n(via a call to " . + "attach_database) to retrieve genes of type [$gene_type]"); + } + } + } + + foreach my $gene_slice (@gene_slices) { + my @genes = @{$gene_slice->get_all_Genes(undef,undef, 1)}; + while(my $gene = shift @genes) { + $value = $self->features2location( [$gene] ); + $self->write( @ff, 'gene', $value ); + $self->write( @ff, "", '/gene='.$gene->stable_id() ); + + + if(defined($gene->display_xref)){ + $self->write( @ff, "",'/locus_tag="'.$gene->display_xref->display_id.'"'); + } + my $desc = $gene->description; + if(defined($desc) and $desc ne ""){ + $desc =~ s/\"//; + $self->write( @ff, "", '/note="'.$gene->description.'"'); + } + + + + foreach my $transcript (@{$gene->get_all_Transcripts}) { + my $translation = $transcript->translation; + + # normal transcripts get dumped differently than pseudogenes + if($translation) { + #normal transcript + $value = $self->features2location($transcript->get_all_Exons); + $self->write(@ff, 'mRNA', $value); + $self->write(@ff,'' , '/gene="'.$gene->stable_id().'"'); + $self->write(@ff,'' + ,'/note="transcript_id='.$transcript->stable_id().'"'); + + # ...and a CDS section + $value = + $self->features2location($transcript->get_all_translateable_Exons); + $self->write(@ff,'CDS', $value); + my $codon_start = $self->transcript_to_codon_start($transcript); + $self->write(@ff,'' , qq{/codon_start="${codon_start}"}) if $codon_start > 1; + $self->write(@ff,'' , '/gene="'.$gene->stable_id().'"'); + $self->write(@ff,'' , '/protein_id="'.$translation->stable_id().'"'); + $self->write(@ff,'' ,'/note="transcript_id='.$transcript->stable_id().'"'); + + foreach my $dbl (@{$transcript->get_all_DBLinks}) { + $value = '/db_xref="'.$dbl->dbname().':'.$dbl->display_id().'"'; + $self->write(@ff, '', $value); + } + + $value = '/translation="'.$transcript->translate()->seq().'"'; + $self->write(@ff, '', $value); + } else { + #pseudogene + $value = $self->features2location($transcript->get_all_Exons); + $self->write(@ff, 'misc_RNA', $value); + $self->write(@ff,'' , '/gene="'.$gene->stable_id().'"'); + foreach my $dbl (@{$transcript->get_all_DBLinks}) { + $value = '/db_xref="'.$dbl->dbname().':'.$dbl->primary_id().'"'; + $self->write(@ff, '', $value); + } + $self->write(@ff,'' , '/note="'.$transcript->biotype().'"'); + $self->write(@ff,'' + ,'/note="transcript_id='.$transcript->stable_id().'"'); + } + } + } + + # exons + foreach my $gene (@{$gene_slice->get_all_Genes(undef,undef,1)}) { + foreach my $exon (@{$gene->get_all_Exons}) { + $self->write(@ff,'exon', $self->features2location([$exon])); + $self->write(@ff,'' , '/note="exon_id='.$exon->stable_id().'"'); + } + } + } + + # + # genscans + # + if($self->is_enabled('genscan')) { + my @genscan_exons; + my @transcripts = @{$slice->get_all_PredictionTranscripts(undef,1)}; + while(my $transcript = shift @transcripts) { + my $exons = $transcript->get_all_Exons(); + push @genscan_exons, @$exons; + $self->write(@ff, 'mRNA', $self->features2location($exons)); + $self->write(@ff, '', '/product="'.$transcript->translate()->seq().'"'); + $self->write(@ff, '', '/note="identifier='.$transcript->stable_id.'"'); + $self->write(@ff, '', '/note="Derived by automated computational' . + ' analysis using gene prediction method:' . + $transcript->analysis->logic_name . '"'); + } + } + + # + # snps + # + if($self->is_enabled('variation') && $slice->can('get_all_VariationFeatures')) { +# $slice->adaptor->db->add_db_adaptor('lite', $lite) if $lite; + + my @variations = @{$slice->get_all_VariationFeatures()}; + while(my $snp = shift @variations) { + my $ss = $snp->start; + my $se = $snp->end; + #skip snps that hang off edge of slice + next if($ss < 1 || $se > $slice->length); + + $self->write(@ff, 'variation', "$ss..$se"); + $self->write(@ff, '' , '/replace="'.$snp->allele_string.'"'); + #$self->write(@ff, '' , '/evidence="'.$snp->status.'"'); + my $rs_id = $snp->variation_name(); + my $db = $snp->source(); +# foreach my $link ($snp->each_DBLink) { +# my $id = $link->primary_id; +# my $db = $link->database; + $self->write(@ff, '', "/db_xref=\"$db:$rs_id\""); +# } + } + +# $slice->adaptor->db->remove_db_adaptor('lite') if $lite; + } + + # + # similarity features + # + if($self->is_enabled('similarity')) { + foreach my $sim (@{$slice->get_all_SimilarityFeatures}) { + $self->write(@ff, 'misc_feature', $self->features2location([$sim])); + $self->write(@ff, '' , '/note="match: '.$sim->hseqname. + ' : '.$sim->hstart.'..'.$sim->hend.'('.$sim->hstrand.')"'); + } + } + + # + # repeats + # + if($self->is_enabled('repeat')) { + my @rfs = @{$slice->get_all_RepeatFeatures()}; + + while(my $repeat = shift @rfs) { + $self->write(@ff, 'repeat_region', $self->features2location([$repeat])); + $self->write(@ff, '' , '/note="' . $repeat->repeat_consensus->name. + ' repeat: matches ' . $repeat->hstart.'..'.$repeat->hend . + '('.$repeat->hstrand.') of consensus"'); + } + + } + + # + # markers + # + if($self->is_enabled('marker') && $slice->can('get_all_MarkerFeatures')) { + my @markers = @{$slice->get_all_MarkerFeatures()}; + while(my $mf = shift @markers) { + $self->write(@ff, 'STS', $self->features2location([$mf])); + if($mf->marker->display_MarkerSynonym) { + $self->write(@ff, '' , '/standard_name="' . + $mf->marker->display_MarkerSynonym->name . '"'); + } + + + #grep out synonyms without a source + my @synonyms = @{$mf->marker->get_all_MarkerSynonyms}; + @synonyms = grep {$_->source } @synonyms; + foreach my $synonym (@synonyms) { + $self->write(@ff, '', '/db_xref="'.$synonym->source. + ':'.$synonym->name.'"'); + } + $self->write(@ff, '', '/note="map_weight='.$mf->map_weight.'"'); + } + } + + # + # contigs + # + if($self->is_enabled('contig')) { + foreach my $segment (@{$slice->project('seqlevel')}) { + my ($start, $end, $slice) = @$segment; + $self->write(@ff, 'misc_feature', + $start .'..'. $end); + $self->write(@ff, '', '/note="contig '.$slice->seq_region_name . + ' ' . $slice->start . '..' . $slice->end . + '(' . $slice->strand . ')"'); + } + } + + $slice->adaptor->db->add_db_adaptor('lite', $lite) if $lite; + +} + +# /codon_start= is the first base to start translating from. This maps +# Ensembl start exon phase to this concept. Here we present the usage +# of phase in this concept where each row shows the sequence the +# spliced_seq() method will return + +# 123456789 +# ATTATGACG +# Phase == 0 ...+++### codon_start=0 // start from 1st A +# Phase == 1 ..+++### codon_start=3 // start from 2nd A (base 3 in the given spliced sequence) +# Phase == 2 .+++### codon_start=2 // start from 2nd A (base 2 in the spliced sequence) +# +# In the case of the final 2 phases we will generate a X codon +# + +sub transcript_to_codon_start { + my ($self, $transcript) = @_; + my $start_phase = $transcript->start_Exon()->phase(); + return ( $start_phase == 1 ) ? 3 : + ( $start_phase == 2 ) ? 2 : + ( $start_phase == 0 ) ? 1 : + -1; +} + + +=head2 dump_fasta + + Arg [1] : Bio::EnsEMBL::Slice + Arg [2] : IO::File $FH + Example : $seq_dumper->dump_fasta($slice, $FH); + Description: Dumps an FASTA flat file to an open file handle + Returntype : none + Exceptions : none + Caller : dump + +=cut + +sub dump_fasta { + my $self = shift; + my $slice = shift; + my $FH = shift; + + my $id = $slice->seq_region_name; + my $seqtype = 'dna'; + my $idtype = $slice->coord_system->name; + my $location = $slice->name; + my $start = 1; + my $end = $slice->length(); + + my $header = ">$id $seqtype:$idtype $location\n"; + $self->print( $FH, $header ); + + #set the formatting to FASTA + my $FORMAT = '^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +'; + + #chunk the sequence in 60kb chunks to use less memory + my $cur = $start; + while($cur <= $end) { + my $to = $cur + 59_999; + $to = $end if($to > $end); + my $seq = $slice->subseq($cur, $to); + $cur = $to + 1; + $self->write($FH, $FORMAT, $seq); + } +} + + + +=head2 features2location + + Arg [1] : listref of Bio::EnsEMBL::SeqFeatures + Example : $location = $self->features2location(\@features); + Description: Constructs an EMBL location string from a list of features + Returntype : string + Exceptions : none + Caller : internal + +=cut + +sub features2location { + my $self = shift; + my $features = shift; + + my @join = (); + + foreach my $f (@$features) { + my $slice = $f->slice; + my $start = $f->start(); + my $end = $f->end(); + my $strand = $f->strand(); + + if($start >= 1 && $end <= $slice->length) { + #this feature in on a slice and doesn't lie outside the boundary + + if($strand == 1) { + push @join, "$start..$end"; + } else { + push @join, "complement($start..$end)"; + } + } else { + my @fs = (); + #this feature is outside the boundary of the dump, + # yet implemented and 'seqlevel' is guaranteed to be 1step + my $projection = $f->project('seqlevel'); + foreach my $segment (@$projection) { + my $slice = $segment->[2]; + my $slc_start = $slice->start(); + my $slc_end = $slice->end(); + my $seq_reg = $slice->seq_region_name(); + if($slice->strand == 1) { + push @join, "$seq_reg:$slc_start..$slc_end"; + } else { + push @join, "complement($seq_reg:$slc_start..$slc_end)"; + } + } + } + } + + my $out = join ',', @join; + + if(scalar @join > 1) { + $out = "join($out)"; + } + + return $out; +} + + +sub _date_string { + my $self = shift; + + my ($sec, $min, $hour, $mday,$mon, $year) = localtime(time()); + + my $month = ('JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', + 'AUG', 'SEP', 'OCT', 'NOV', 'DEC')[$mon]; + $year += 1900; + + return "$mday-$month-$year"; +} + + +sub write { + my ($self, $FH, $FORMAT, @values) = @_; + + #while the last value still contains something + while(defined($values[-1]) and $values[-1] ne '') { + formline($FORMAT, @values); + $self->print( $FH, $^A ); + $^A = ''; + } +} + +sub write_genbank_seq { + my $self = shift; + my $FH = shift; + my $seq = shift; + my $base_total = shift; + + $base_total ||= 0; + + my $GENBANK_SEQ = +'@>>>>>>>> ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<<~ +'; + + my $total = -59 + $base_total; + #keep track of total and print lines of 60 bases with spaces every 10bp + while($$seq) { + $total += 60; + formline($GENBANK_SEQ,$total, $$seq, $$seq, $$seq, $$seq, $$seq, $$seq); + $self->print( $FH, $^A ); + $^A = ''; + } +} + +sub write_embl_seq { + my $self = shift; + my $FH = shift; + my $seq = shift; + my $base_total = shift; + + $base_total ||= 0; + + my $EMBL_SEQ = +' ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<<@>>>>>>>>>~ +'; + #keep track of total and print lines of 60 bases with spaces every 10bp + my $length = length($$seq); + my $total = $length - $base_total; + while($$seq) { + $total -= 60; + $total = 0 if($total < 0); + formline($EMBL_SEQ, + $$seq, $$seq, $$seq, $$seq, $$seq, $$seq, + $length - $total); + $self->print( $FH, $^A ); + $^A = ''; + } +} + +sub print { + my( $self, $FH, $string ) = @_; + if(!print $FH $string){ + print STDERR "Problem writing to disk\n"; + print STDERR "the string is $string\n"; + die "Could not write to file handle"; + } +} + +1;