comparison variant_effect_predictor/Bio/EnsEMBL/Utils/SeqDumper.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21 =head1 NAME
22
23 Bio::EnsEMBL::Utils::SeqDumper
24
25 =head1 SYNOPSIS
26
27 $seq_dumper = Bio::EnsEMBL::Utils::SeqDumper->new();
28
29 # don't dump snps or repeats
30 $seq_dumper->disable_feature_type('repeat');
31 $seq_dumper->disable_feature_type('variation');
32
33 # dump EMBL format to STDOUT
34 $seq_dumper->dump( $slice, 'EMBL' );
35
36 # dump GENBANK format to a file
37 $seq_dumper->dump( $slice, 'GENBANK', 'out.genbank' );
38
39 # dump FASTA format to a file
40 $seq_dumper->dump( $slice, 'FASTA', 'out.fasta' );
41
42 =head1 DESCRIPTION
43
44 A relatively simple and lite-weight flat file dumper for Ensembl slices.
45 The memory efficiency could be improved and this is currently not very
46 good for dumping very large sequences such as whole chromosomes.
47
48 =head1 METHODS
49
50 =cut
51
52
53 package Bio::EnsEMBL::Utils::SeqDumper;
54
55 use strict;
56
57 use IO::File;
58 use vars qw(@ISA);
59
60 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
61
62 #keys must be uppercase
63 my $DUMP_HANDLERS =
64 { 'FASTA' => \&dump_fasta,
65 'EMBL' => \&dump_embl,
66 'GENBANK' => \&dump_genbank };
67
68 my @COMMENTS =
69 ('This sequence was annotated by the Ensembl system. Please visit ' .
70 'the Ensembl web site, http://www.ensembl.org/ for more information.',
71
72 'All feature locations are relative to the first (5\') base ' .
73 'of the sequence in this file. The sequence presented is '.
74 'always the forward strand of the assembly. Features ' .
75 'that lie outside of the sequence contained in this file ' .
76 'have clonal location coordinates in the format: ' .
77 '<clone accession>.<version>:<start>..<end>',
78
79 'The /gene indicates a unique id for a gene, /note="transcript_id=..."' .
80 ' a unique id for a transcript, /protein_id a unique id for a peptide ' .
81 'and note="exon_id=..." a unique id for an exon. These ids are ' .
82 'maintained wherever possible between versions.',
83
84 'All the exons and transcripts in Ensembl are confirmed by ' .
85 'similarity to either protein or cDNA sequences.');
86
87
88 =head2 new
89
90 Arg [1] : none
91 Example : $seq_dumper = Bio::EnsEMBL::Utils::SeqDumper->new;
92 Description: Creates a new SeqDumper
93 Returntype : Bio::EnsEMBL::Utils::SeqDumper
94 Exceptions : none
95 Caller : general
96
97 =cut
98
99 sub new {
100 my ($caller, $slice, $params) = @_;
101
102 my $class = ref($caller) || $caller;
103
104 my $feature_types = {'gene' => 1,
105 'genscan' => 1,
106 'repeat' => 1,
107 'similarity' => 1,
108 'variation' => 1,
109 'contig' => 1,
110 'marker' => 1,
111 'estgene' => 0,
112 'vegagene' => 0};
113
114 my $self = bless {'feature_types' => $feature_types}, $class;
115
116 foreach my $p (sort keys %{$params || {}}) {
117 $self->{$p} = $params->{$p};
118 }
119
120 return $self;
121 }
122
123
124
125 =head2 enable_feature_type
126
127 Arg [1] : string $type
128 Example : $seq_dumper->enable_feature_type('similarity');
129 Description: Enables the dumping of a specific type of feature
130 Returntype : none
131 Exceptions : warn if invalid feature type is passed,
132 thrown if no feature type is passed
133 Caller : general
134
135 =cut
136
137 sub enable_feature_type {
138 my ($self, $type) = @_;
139
140 $type || throw("type arg is required");
141
142 if(exists($self->{'feature_types'}->{$type})) {
143 $self->{'feature_types'}->{$type} = 1;
144 } else {
145 warning("unknown feature type '$type'\n" .
146 "valid types are: " . join(',', keys %{$self->{'feature_types'}}));
147 }
148 }
149
150
151
152 =head2 attach_database
153
154 Arg [1] : string name
155 Arg [2] : Bio::EnsEMBL::DBSQL::DBAdaptor
156 Example : $seq_dumper->attach_database('estgene', $estgene_db);
157 Description: Attaches a database to the seqdumper that can be used to
158 dump data which is external to the ensembl core database.
159 Currently this is necessary to dump est genes and vega genes
160 Returntype : none
161 Exceptions : thrown if incorrect argument is supplied
162 Caller : general
163
164 =cut
165
166 sub attach_database {
167 my ($self, $name, $db) = @_;
168
169 $name || throw("name arg is required");
170 unless($db && ref($db) && $db->isa('Bio::EnsEMBL::DBSQL::DBAdaptor')) {
171 throw("db arg must be a Bio::EnsEMBL::DBSQL::DBAdaptor not a [$db]");
172 }
173
174 $self->{'attached_dbs'}->{$name} = $db;
175 }
176
177
178
179 =head2 get_database
180
181 Arg [1] : string $name
182 Example : $db = $seq_dumper->get_database('vega');
183 Description: Retrieves a database that has been attached to the
184 seqdumper via the attach database call.
185 Returntype : Bio::EnsEMBL::DBSQL::DBAdaptor
186 Exceptions : thrown if incorrect argument is supplied
187 Caller : dump_feature_table
188
189 =cut
190
191 sub get_database {
192 my ($self, $name) = @_;
193
194 $name || throw("name arg is required");
195
196 return $self->{'attached_dbs'}->{$name};
197 }
198
199
200
201 =head2 remove_database
202
203 Arg [1] : string $name
204 Example : $db = $seq_dumper->remove_database('estgene');
205 Description: Removes a database that has been attached to the seqdumper
206 via the attach database call. The database that is removed
207 is returned (or undef if it did not exist).
208 Returntype : Bio::EnsEMBL::DBSQL::DBAdaptor
209 Exceptions : thrown if incorrect argument is supplied
210 Caller : general
211
212 =cut
213
214 sub remove_database {
215 my ($self, $name) = @_;
216
217 $name || throw("name arg is required");
218
219 if(exists $self->{'attached_dbs'}->{$name}) {
220 return delete $self->{'attached_dbs'}->{$name};
221 }
222
223 return undef;
224 }
225
226
227 =head2 disable_feature_type
228
229 Arg [1] : string $type
230 Example : $seq_dumper->disable_feature_type('genes');
231 Description: Disables the dumping of a specific type of feature
232 Returntype : none
233 Exceptions : warn if an invalid feature type is passed,
234 thrown if no feature type is passed
235 Caller : general
236
237 =cut
238
239 sub disable_feature_type {
240 my ($self, $type) = @_;
241
242 $type || throw("type arg is required");
243
244 if(exists($self->{'feature_types'}->{$type})) {
245 $self->{'feature_types'}->{$type} = 0;
246 } else {
247 warning("unknown feature type '$type'\n" .
248 "valid types are: " . join(',', keys %{$self->{'feature_types'}}));
249 }
250 }
251
252
253
254 =head2 is_enabled
255
256 Arg [1] : string $type
257 Example : do_something() if($seq_dumper->is_enabled('gene'));
258 Description: checks if a specific feature type is enabled
259 Returntype : none
260 Exceptions : warning if invalid type is passed,
261 thrown if no type is passed
262 Caller : general
263
264 =cut
265
266 sub is_enabled {
267 my ($self, $type) = @_;
268
269 $type || throw("type arg is required");
270
271 if(exists($self->{'feature_types'}->{$type})) {
272 return $self->{'feature_types'}->{$type};
273 } else {
274 warning("unknown feature type '$type'\n" .
275 "valid types are: " . join(',', keys %{$self->{'feature_types'}}));
276 }
277 }
278
279
280 =head2 dump
281
282 Arg [1] : Bio::EnsEMBL::Slice slice
283 The slice to dump
284 Arg [2] : string $format
285 The name of the format to dump
286 Arg [3] : (optional) $outfile
287 The name of the file to dump to. If no file is specified STDOUT
288 is used
289 Arg [4] : (optional) $seq
290 Sequence to dump
291 Arg [4] : (optional) $no_append
292 Default action is to open the file in append mode. This will
293 turn that mode off
294 Example : $seq_dumper->dump($slice, 'EMBL');
295 Description: Dumps a region of a genome specified by the slice argument into
296 an outfile of the format $format
297 Returntype : none
298 Exceptions : thrown if slice or format args are not supplied
299 Caller : general
300
301 =cut
302
303
304 sub dump {
305 my ($self, $slice, $format, $outfile, $seq, $no_append) = @_;
306
307 $format || throw("format arg is required");
308 $slice || throw("slice arg is required");
309
310 my $dump_handler = $DUMP_HANDLERS->{uc($format)};
311
312 unless($dump_handler) {
313 throw("No dump handler is defined for format $format\n");
314 }
315
316
317 my $FH = IO::File->new;;
318 if($outfile) {
319 my $mode = ($no_append) ? '>' : '>>';
320 $FH->open("${mode}${outfile}") or throw("Could not open file $outfile: $!");
321 } else {
322 $FH = \*STDOUT;
323 #mod_perl did not like the following
324 #$FH->fdopen(fileno(STDOUT), "w")
325 # or throw("Could not open currently selected output filehandle " .
326 # "for writing");
327 }
328
329 &$dump_handler($self, $slice, $FH, $seq);
330
331 $FH->close if ($outfile); #close if we were writing to a file
332 }
333
334 =head2 dump_embl
335
336 Arg [1] : Bio::EnsEMBL::Slice
337 Arg [2] : IO::File $FH
338 Arg [3] : optional sequence string
339 Example : $seq_dumper->dump_embl($slice, $FH);
340 Description: Dumps an EMBL flat file to an open file handle
341 Returntype : none
342 Exceptions : none
343 Caller : dump
344
345 =cut
346
347 sub dump_embl {
348 my $self = shift;
349 my $slice = shift;
350 my $FH = shift;
351 my $SEQ = shift;
352
353 my $len = $slice->length;
354
355 my $version;
356 my $acc;
357
358 my $cs = $slice->coord_system();
359 my $name_str = $cs->name() . ' ' . $slice->seq_region_name();
360 $name_str .= ' ' . $cs->version if($cs->version);
361
362 my $start = $slice->start;
363 my $end = $slice->end;
364
365 #determine if this slice is the entire seq region
366 #if it is then we just use the name as the id
367 my $slice_adaptor = $slice->adaptor();
368 my $full_slice =
369 $slice->adaptor->fetch_by_region($cs->name,
370 $slice->seq_region_name,
371 undef,undef,undef,
372 $cs->version);
373
374
375 my $entry_name = $slice->seq_region_name();
376
377
378
379 if($full_slice->name eq $slice->name) {
380 $name_str .= ' full sequence';
381 $acc = $slice->seq_region_name();
382 my @acc_ver = split(/\./, $acc);
383 if(@acc_ver == 2) {
384 $acc = $acc_ver[0];
385 $version = $acc_ver[0] . '.'. $acc_ver[1];
386 } elsif(@acc_ver == 1 && $cs->version()) {
387 $version = $acc . '.'. $cs->version();
388 } else {
389 $version = $acc;
390 }
391 } else {
392 $name_str .= ' partial sequence';
393 $acc = $slice->name();
394 $version = $acc;
395 }
396
397 $acc = $slice->name();
398
399
400
401 #line breaks are allowed near the end of the line on ' ', "\t", "\n", ','
402 $: = (" \t\n-,");
403
404 #############
405 # dump header
406 #############
407
408 my $EMBL_HEADER =
409 '@< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~
410 ';
411
412 #ID and moltype
413 # HTG = High Throughput Genome division, probably most suitable
414 # and it would be hard to come up with another appropriate division
415 # that worked for all organisms (e.g. plants are in PLN but human is
416 # in HUM).
417 my $VALUE = "$entry_name standard; DNA; HTG; $len BP.";
418 $self->write($FH, $EMBL_HEADER, 'ID', $VALUE);
419 $self->print( $FH, "XX\n" );
420
421 #Accession
422 $self->write($FH, $EMBL_HEADER, 'AC', $acc);
423 $self->print( $FH, "XX\n" );
424
425 #Version
426 $self->write($FH, $EMBL_HEADER, 'SV', $version);
427 $self->print( $FH, "XX\n" );
428
429 #Date
430 $self->write($FH, $EMBL_HEADER, 'DT', $self->_date_string);
431 $self->print( $FH, "XX\n" );
432
433 my $meta_container = $slice->adaptor()->db()->get_MetaContainer();
434
435 #Description
436 $self->write($FH, $EMBL_HEADER, 'DE', $meta_container->get_scientific_name() .
437 " $name_str $start..$end annotated by Ensembl");
438 $self->print( $FH, "XX\n" );
439
440 #key words
441 $self->write($FH, $EMBL_HEADER, 'KW', '.');
442 $self->print( $FH, "XX\n" );
443
444 #Species
445 my $species_name = $meta_container->get_scientific_name();
446 if(my $cn = $meta_container->get_common_name()) {
447 $species_name .= " ($cn)";
448 }
449
450 $self->write($FH, $EMBL_HEADER, 'OS', $species_name);
451
452 #Classification
453 my $cls = $meta_container->get_classification();
454 $self->write($FH, $EMBL_HEADER, 'OC', join('; ', reverse(@{$cls})) . '.');
455 $self->print( $FH, "XX\n" );
456
457 #References (we are not dumping refereneces)
458
459 #Database References (we are not dumping these)
460
461 #comments
462 foreach my $comment (@COMMENTS) {
463 $self->write($FH, $EMBL_HEADER, 'CC', $comment);
464 $self->print( $FH, "XX\n" );
465 }
466
467 ####################
468 #DUMP FEATURE TABLE
469 ####################
470 $self->print( $FH, "FH Key Location/Qualifiers\n" );
471
472 my $FEATURE_TABLE =
473 'FT ^<<<<<<<<<<<<<<<^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~
474 ';
475 $self->_dump_feature_table($slice, $FH, $FEATURE_TABLE);
476
477 #write an XX after the feature tables
478 $self->print( $FH, "XX\n" );
479
480 ###################
481 #DUMP SEQUENCE
482 ###################
483
484 if(!defined($SEQ)){
485 $SEQ = $slice->seq();
486 }
487 # my $SEQ = $slice->seq();
488 my $length = length($SEQ);
489 my $a_count = $SEQ =~ tr/aA/aA/;
490 my $c_count = $SEQ =~ tr/cC/cC/;
491 my $t_count = $SEQ =~ tr/tT/tT/;
492 my $g_count = $SEQ =~ tr/gG/gG/;
493 my $other_count = $length - $a_count - $c_count - $t_count - $g_count;
494
495 my $value = "Sequence $length BP; $a_count A; $c_count C; " .
496 "$g_count G; $t_count T; $other_count other;";
497 $self->print($FH, 'SQ '.$value."\n");
498
499 $self->write_embl_seq($FH, \$SEQ);
500
501
502 $self->print( $FH, "//\n" );
503
504 # Set formatting back to normal
505 $: = " \n-";
506 }
507
508
509
510
511 =head2 dump_genbank
512
513 Arg [1] : Bio::EnsEMBL::Slice
514 Arg [2] : IO::File $FH
515 Example : $seq_dumper->dump_genbank($slice, $FH);
516 Description: Dumps a GENBANK flat file to an open file handle
517 Returntype : none
518 Exceptions : none
519 Caller : dump
520
521 =cut
522
523 sub dump_genbank {
524 my ($self, $slice, $FH, $SEQ) = @_;
525
526 #line breaks are allowed near the end of the line on ' ', "\t", "\n", ','
527 $: = " \t\n-,";
528
529 my $GENBANK_HEADER =
530 '^<<<<<<<<< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~
531 ';
532
533 my $GENBANK_SUBHEADER =
534 ' ^<<<<<<< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~
535 ';
536
537 my $GENBANK_FT =
538 ' ^<<<<<<<<<<<<<< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~
539 ';
540
541 my $version;
542 my $acc;
543
544 my $cs = $slice->coord_system();
545
546 my $name_str = $cs->name() . ' ' . $slice->seq_region_name();
547
548 $name_str .= ' ' . $cs->version if($cs->version);
549
550 #determine if this slice is the entire seq region
551 #if it is then we just use the name as the id
552 my $slice_adaptor = $slice->adaptor();
553 my $full_slice =
554 $slice->adaptor->fetch_by_region($cs->name,
555 $slice->seq_region_name,
556 undef,undef,undef,
557 $cs->version);
558
559
560 my $entry_name = $slice->seq_region_name();
561
562 if($full_slice->name eq $slice->name) {
563 $name_str .= ' full sequence';
564 $acc = $slice->seq_region_name();
565 my @acc_ver = split(/\./, $acc);
566 if(@acc_ver == 2) {
567 $acc = $acc_ver[0];
568 $version = $acc_ver[0] . $acc_ver[1];
569 } elsif(@acc_ver == 1 && $cs->version()) {
570 $version = $acc . $cs->version();
571 } else {
572 $version = $acc;
573 }
574 } else {
575 $name_str .= ' partial sequence';
576 $acc = $slice->name();
577 $version = $acc;
578 }
579
580 $acc = $slice->name(); # to keep format consistent for all
581
582 my $length = $slice->length;
583 my $start = $slice->start();
584 my $end = $slice->end();
585
586 my $date = $self->_date_string;
587
588 my $meta_container = $slice->adaptor()->db()->get_MetaContainer();
589
590 #LOCUS
591 my $tag = 'LOCUS';
592 my $value = "$entry_name $length bp DNA HTG $date";
593 $self->write($FH, $GENBANK_HEADER, $tag, $value);
594
595 #DEFINITION
596 $tag = "DEFINITION";
597 $value = $meta_container->get_scientific_name() .
598 " $name_str $start..$end reannotated via EnsEMBL";
599 $self->write($FH, $GENBANK_HEADER, $tag, $value);
600
601 #ACCESSION
602 $self->write($FH, $GENBANK_HEADER, 'ACCESSION', $acc);
603
604 #VERSION
605 $self->write($FH, $GENBANK_HEADER, 'VERSION', $version);
606
607 # KEYWORDS
608 $self->write($FH, $GENBANK_HEADER, 'KEYWORDS', '.');
609
610 # SOURCE
611 my $common_name = $meta_container->get_common_name();
612 $common_name = $meta_container->get_scientific_name() unless $common_name;
613 $self->write($FH, $GENBANK_HEADER, 'SOURCE', $common_name);
614
615 #organism
616 my @cls = $meta_container->get_classification();
617 shift @cls;
618 $self->write($FH, $GENBANK_SUBHEADER, 'ORGANISM', $meta_container->get_scientific_name());
619 $self->write($FH, $GENBANK_SUBHEADER, '', join('; ', reverse @cls) . ".");
620
621 #refereneces
622
623 #comments
624 foreach my $comment (@COMMENTS) {
625 $self->write($FH, $GENBANK_HEADER, 'COMMENT', $comment);
626 }
627
628 ####################
629 # DUMP FEATURE TABLE
630 ####################
631 $self->print( $FH, "FEATURES Location/Qualifiers\n" );
632 $self->_dump_feature_table($slice, $FH, $GENBANK_FT);
633
634 ####################
635 # DUMP SEQUENCE
636 ####################
637
638 if(!defined($SEQ)){
639 $SEQ = $slice->seq();
640 }
641 # my $SEQ = $slice->seq();
642 my $a_count = $SEQ =~ tr/aA/aA/;
643 my $c_count = $SEQ =~ tr/cC/cC/;
644 my $t_count = $SEQ =~ tr/tT/tT/;
645 my $g_count = $SEQ =~ tr/gG/gG/;
646 my $bp_length = length($SEQ);
647 my $other_count = $bp_length - $a_count - $c_count - $t_count - $g_count;
648
649 $tag = 'BASE COUNT';
650 $value = "$a_count a $c_count c $g_count g $t_count t";
651 $value .= " $other_count n" if($other_count);
652 $self->print($FH, qq{$tag $value\n});
653 $self->print( $FH, "ORIGIN\n" );
654
655 $self->write_genbank_seq($FH, \$SEQ);
656
657 $self->print( $FH, "//\n" );
658
659 # Set formatting back to normal
660 $: = " \n-";
661 }
662
663
664
665 =head2 _dump_feature_table
666
667 Arg [1] : Bio::EnsEMBL::Slice slice
668 Example : none
669 Description: Helper method used to dump feature tables used in EMBL, FASTA,
670 GENBANK. Assumes formating of file handle has been setup
671 already to use $FEAT and $VALUE values.
672 Returntype : none
673 Exceptions : none
674 Caller : internal
675
676 =cut
677
678 sub _dump_feature_table {
679 my $self = shift;
680 my $slice = shift;
681 my $FH = shift;
682 my $FORMAT = shift;
683
684 #use only the core database to dump features (except for bloody snps)
685 my $lite = $slice->adaptor->db->remove_db_adaptor('lite');
686
687 my $meta = $slice->adaptor->db->get_MetaContainer;
688
689 #lump file handle and format string together for simpler method calls
690 my @ff = ($FH, $FORMAT);
691 my $value;
692
693 #source
694 my $classification = join(', ', $meta->get_classification());
695 $self->write(@ff,'source', "1.." . $slice->length());
696 $self->write(@ff,'' , '/organism="'.$meta->get_scientific_name(). '"');
697 $self->write(@ff,'' , '/db_xref="taxon:'.$meta->get_taxonomy_id().'"');
698
699 #
700 # Transcripts & Genes
701 #
702 my @gene_slices;
703 if($self->is_enabled('gene')) {
704 push @gene_slices, $slice;
705 }
706
707 # Retrieve slices of other database where we need to pull genes from
708
709 my $gene_dbs = {'vegagene' => 'vega',
710 'estgene' => 'estgene'};
711
712 foreach my $gene_type (keys %$gene_dbs) {
713 if($self->is_enabled($gene_type)) {
714 my $db = $self->get_database($gene_dbs->{$gene_type});
715 if($db) {
716 my $sa = $db->get_SliceAdaptor();
717 push @gene_slices, $sa->fetch_by_name($slice->name());
718 } else {
719 warning("A [". $gene_dbs->{$gene_type} ."] database must be " .
720 "attached to this SeqDumper\n(via a call to " .
721 "attach_database) to retrieve genes of type [$gene_type]");
722 }
723 }
724 }
725
726 foreach my $gene_slice (@gene_slices) {
727 my @genes = @{$gene_slice->get_all_Genes(undef,undef, 1)};
728 while(my $gene = shift @genes) {
729 $value = $self->features2location( [$gene] );
730 $self->write( @ff, 'gene', $value );
731 $self->write( @ff, "", '/gene='.$gene->stable_id() );
732
733
734 if(defined($gene->display_xref)){
735 $self->write( @ff, "",'/locus_tag="'.$gene->display_xref->display_id.'"');
736 }
737 my $desc = $gene->description;
738 if(defined($desc) and $desc ne ""){
739 $desc =~ s/\"//;
740 $self->write( @ff, "", '/note="'.$gene->description.'"');
741 }
742
743
744
745 foreach my $transcript (@{$gene->get_all_Transcripts}) {
746 my $translation = $transcript->translation;
747
748 # normal transcripts get dumped differently than pseudogenes
749 if($translation) {
750 #normal transcript
751 $value = $self->features2location($transcript->get_all_Exons);
752 $self->write(@ff, 'mRNA', $value);
753 $self->write(@ff,'' , '/gene="'.$gene->stable_id().'"');
754 $self->write(@ff,''
755 ,'/note="transcript_id='.$transcript->stable_id().'"');
756
757 # ...and a CDS section
758 $value =
759 $self->features2location($transcript->get_all_translateable_Exons);
760 $self->write(@ff,'CDS', $value);
761 my $codon_start = $self->transcript_to_codon_start($transcript);
762 $self->write(@ff,'' , qq{/codon_start="${codon_start}"}) if $codon_start > 1;
763 $self->write(@ff,'' , '/gene="'.$gene->stable_id().'"');
764 $self->write(@ff,'' , '/protein_id="'.$translation->stable_id().'"');
765 $self->write(@ff,'' ,'/note="transcript_id='.$transcript->stable_id().'"');
766
767 foreach my $dbl (@{$transcript->get_all_DBLinks}) {
768 $value = '/db_xref="'.$dbl->dbname().':'.$dbl->display_id().'"';
769 $self->write(@ff, '', $value);
770 }
771
772 $value = '/translation="'.$transcript->translate()->seq().'"';
773 $self->write(@ff, '', $value);
774 } else {
775 #pseudogene
776 $value = $self->features2location($transcript->get_all_Exons);
777 $self->write(@ff, 'misc_RNA', $value);
778 $self->write(@ff,'' , '/gene="'.$gene->stable_id().'"');
779 foreach my $dbl (@{$transcript->get_all_DBLinks}) {
780 $value = '/db_xref="'.$dbl->dbname().':'.$dbl->primary_id().'"';
781 $self->write(@ff, '', $value);
782 }
783 $self->write(@ff,'' , '/note="'.$transcript->biotype().'"');
784 $self->write(@ff,''
785 ,'/note="transcript_id='.$transcript->stable_id().'"');
786 }
787 }
788 }
789
790 # exons
791 foreach my $gene (@{$gene_slice->get_all_Genes(undef,undef,1)}) {
792 foreach my $exon (@{$gene->get_all_Exons}) {
793 $self->write(@ff,'exon', $self->features2location([$exon]));
794 $self->write(@ff,'' , '/note="exon_id='.$exon->stable_id().'"');
795 }
796 }
797 }
798
799 #
800 # genscans
801 #
802 if($self->is_enabled('genscan')) {
803 my @genscan_exons;
804 my @transcripts = @{$slice->get_all_PredictionTranscripts(undef,1)};
805 while(my $transcript = shift @transcripts) {
806 my $exons = $transcript->get_all_Exons();
807 push @genscan_exons, @$exons;
808 $self->write(@ff, 'mRNA', $self->features2location($exons));
809 $self->write(@ff, '', '/product="'.$transcript->translate()->seq().'"');
810 $self->write(@ff, '', '/note="identifier='.$transcript->stable_id.'"');
811 $self->write(@ff, '', '/note="Derived by automated computational' .
812 ' analysis using gene prediction method:' .
813 $transcript->analysis->logic_name . '"');
814 }
815 }
816
817 #
818 # snps
819 #
820 if($self->is_enabled('variation') && $slice->can('get_all_VariationFeatures')) {
821 # $slice->adaptor->db->add_db_adaptor('lite', $lite) if $lite;
822
823 my @variations = @{$slice->get_all_VariationFeatures()};
824 while(my $snp = shift @variations) {
825 my $ss = $snp->start;
826 my $se = $snp->end;
827 #skip snps that hang off edge of slice
828 next if($ss < 1 || $se > $slice->length);
829
830 $self->write(@ff, 'variation', "$ss..$se");
831 $self->write(@ff, '' , '/replace="'.$snp->allele_string.'"');
832 #$self->write(@ff, '' , '/evidence="'.$snp->status.'"');
833 my $rs_id = $snp->variation_name();
834 my $db = $snp->source();
835 # foreach my $link ($snp->each_DBLink) {
836 # my $id = $link->primary_id;
837 # my $db = $link->database;
838 $self->write(@ff, '', "/db_xref=\"$db:$rs_id\"");
839 # }
840 }
841
842 # $slice->adaptor->db->remove_db_adaptor('lite') if $lite;
843 }
844
845 #
846 # similarity features
847 #
848 if($self->is_enabled('similarity')) {
849 foreach my $sim (@{$slice->get_all_SimilarityFeatures}) {
850 $self->write(@ff, 'misc_feature', $self->features2location([$sim]));
851 $self->write(@ff, '' , '/note="match: '.$sim->hseqname.
852 ' : '.$sim->hstart.'..'.$sim->hend.'('.$sim->hstrand.')"');
853 }
854 }
855
856 #
857 # repeats
858 #
859 if($self->is_enabled('repeat')) {
860 my @rfs = @{$slice->get_all_RepeatFeatures()};
861
862 while(my $repeat = shift @rfs) {
863 $self->write(@ff, 'repeat_region', $self->features2location([$repeat]));
864 $self->write(@ff, '' , '/note="' . $repeat->repeat_consensus->name.
865 ' repeat: matches ' . $repeat->hstart.'..'.$repeat->hend .
866 '('.$repeat->hstrand.') of consensus"');
867 }
868
869 }
870
871 #
872 # markers
873 #
874 if($self->is_enabled('marker') && $slice->can('get_all_MarkerFeatures')) {
875 my @markers = @{$slice->get_all_MarkerFeatures()};
876 while(my $mf = shift @markers) {
877 $self->write(@ff, 'STS', $self->features2location([$mf]));
878 if($mf->marker->display_MarkerSynonym) {
879 $self->write(@ff, '' , '/standard_name="' .
880 $mf->marker->display_MarkerSynonym->name . '"');
881 }
882
883
884 #grep out synonyms without a source
885 my @synonyms = @{$mf->marker->get_all_MarkerSynonyms};
886 @synonyms = grep {$_->source } @synonyms;
887 foreach my $synonym (@synonyms) {
888 $self->write(@ff, '', '/db_xref="'.$synonym->source.
889 ':'.$synonym->name.'"');
890 }
891 $self->write(@ff, '', '/note="map_weight='.$mf->map_weight.'"');
892 }
893 }
894
895 #
896 # contigs
897 #
898 if($self->is_enabled('contig')) {
899 foreach my $segment (@{$slice->project('seqlevel')}) {
900 my ($start, $end, $slice) = @$segment;
901 $self->write(@ff, 'misc_feature',
902 $start .'..'. $end);
903 $self->write(@ff, '', '/note="contig '.$slice->seq_region_name .
904 ' ' . $slice->start . '..' . $slice->end .
905 '(' . $slice->strand . ')"');
906 }
907 }
908
909 $slice->adaptor->db->add_db_adaptor('lite', $lite) if $lite;
910
911 }
912
913 # /codon_start= is the first base to start translating from. This maps
914 # Ensembl start exon phase to this concept. Here we present the usage
915 # of phase in this concept where each row shows the sequence the
916 # spliced_seq() method will return
917
918 # 123456789
919 # ATTATGACG
920 # Phase == 0 ...+++### codon_start=0 // start from 1st A
921 # Phase == 1 ..+++### codon_start=3 // start from 2nd A (base 3 in the given spliced sequence)
922 # Phase == 2 .+++### codon_start=2 // start from 2nd A (base 2 in the spliced sequence)
923 #
924 # In the case of the final 2 phases we will generate a X codon
925 #
926
927 sub transcript_to_codon_start {
928 my ($self, $transcript) = @_;
929 my $start_phase = $transcript->start_Exon()->phase();
930 return ( $start_phase == 1 ) ? 3 :
931 ( $start_phase == 2 ) ? 2 :
932 ( $start_phase == 0 ) ? 1 :
933 -1;
934 }
935
936
937 =head2 dump_fasta
938
939 Arg [1] : Bio::EnsEMBL::Slice
940 Arg [2] : IO::File $FH
941 Example : $seq_dumper->dump_fasta($slice, $FH);
942 Description: Dumps an FASTA flat file to an open file handle
943 Returntype : none
944 Exceptions : none
945 Caller : dump
946
947 =cut
948
949 sub dump_fasta {
950 my $self = shift;
951 my $slice = shift;
952 my $FH = shift;
953
954 my $id = $slice->seq_region_name;
955 my $seqtype = 'dna';
956 my $idtype = $slice->coord_system->name;
957 my $location = $slice->name;
958 my $start = 1;
959 my $end = $slice->length();
960
961 my $header = ">$id $seqtype:$idtype $location\n";
962 $self->print( $FH, $header );
963
964 #set the formatting to FASTA
965 my $FORMAT = '^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
966 ';
967
968 #chunk the sequence in 60kb chunks to use less memory
969 my $cur = $start;
970 while($cur <= $end) {
971 my $to = $cur + 59_999;
972 $to = $end if($to > $end);
973 my $seq = $slice->subseq($cur, $to);
974 $cur = $to + 1;
975 $self->write($FH, $FORMAT, $seq);
976 }
977 }
978
979
980
981 =head2 features2location
982
983 Arg [1] : listref of Bio::EnsEMBL::SeqFeatures
984 Example : $location = $self->features2location(\@features);
985 Description: Constructs an EMBL location string from a list of features
986 Returntype : string
987 Exceptions : none
988 Caller : internal
989
990 =cut
991
992 sub features2location {
993 my $self = shift;
994 my $features = shift;
995
996 my @join = ();
997
998 foreach my $f (@$features) {
999 my $slice = $f->slice;
1000 my $start = $f->start();
1001 my $end = $f->end();
1002 my $strand = $f->strand();
1003
1004 if($start >= 1 && $end <= $slice->length) {
1005 #this feature in on a slice and doesn't lie outside the boundary
1006
1007 if($strand == 1) {
1008 push @join, "$start..$end";
1009 } else {
1010 push @join, "complement($start..$end)";
1011 }
1012 } else {
1013 my @fs = ();
1014 #this feature is outside the boundary of the dump,
1015 # yet implemented and 'seqlevel' is guaranteed to be 1step
1016 my $projection = $f->project('seqlevel');
1017 foreach my $segment (@$projection) {
1018 my $slice = $segment->[2];
1019 my $slc_start = $slice->start();
1020 my $slc_end = $slice->end();
1021 my $seq_reg = $slice->seq_region_name();
1022 if($slice->strand == 1) {
1023 push @join, "$seq_reg:$slc_start..$slc_end";
1024 } else {
1025 push @join, "complement($seq_reg:$slc_start..$slc_end)";
1026 }
1027 }
1028 }
1029 }
1030
1031 my $out = join ',', @join;
1032
1033 if(scalar @join > 1) {
1034 $out = "join($out)";
1035 }
1036
1037 return $out;
1038 }
1039
1040
1041 sub _date_string {
1042 my $self = shift;
1043
1044 my ($sec, $min, $hour, $mday,$mon, $year) = localtime(time());
1045
1046 my $month = ('JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL',
1047 'AUG', 'SEP', 'OCT', 'NOV', 'DEC')[$mon];
1048 $year += 1900;
1049
1050 return "$mday-$month-$year";
1051 }
1052
1053
1054 sub write {
1055 my ($self, $FH, $FORMAT, @values) = @_;
1056
1057 #while the last value still contains something
1058 while(defined($values[-1]) and $values[-1] ne '') {
1059 formline($FORMAT, @values);
1060 $self->print( $FH, $^A );
1061 $^A = '';
1062 }
1063 }
1064
1065 sub write_genbank_seq {
1066 my $self = shift;
1067 my $FH = shift;
1068 my $seq = shift;
1069 my $base_total = shift;
1070
1071 $base_total ||= 0;
1072
1073 my $GENBANK_SEQ =
1074 '@>>>>>>>> ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<<~
1075 ';
1076
1077 my $total = -59 + $base_total;
1078 #keep track of total and print lines of 60 bases with spaces every 10bp
1079 while($$seq) {
1080 $total += 60;
1081 formline($GENBANK_SEQ,$total, $$seq, $$seq, $$seq, $$seq, $$seq, $$seq);
1082 $self->print( $FH, $^A );
1083 $^A = '';
1084 }
1085 }
1086
1087 sub write_embl_seq {
1088 my $self = shift;
1089 my $FH = shift;
1090 my $seq = shift;
1091 my $base_total = shift;
1092
1093 $base_total ||= 0;
1094
1095 my $EMBL_SEQ =
1096 ' ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<<@>>>>>>>>>~
1097 ';
1098 #keep track of total and print lines of 60 bases with spaces every 10bp
1099 my $length = length($$seq);
1100 my $total = $length - $base_total;
1101 while($$seq) {
1102 $total -= 60;
1103 $total = 0 if($total < 0);
1104 formline($EMBL_SEQ,
1105 $$seq, $$seq, $$seq, $$seq, $$seq, $$seq,
1106 $length - $total);
1107 $self->print( $FH, $^A );
1108 $^A = '';
1109 }
1110 }
1111
1112 sub print {
1113 my( $self, $FH, $string ) = @_;
1114 if(!print $FH $string){
1115 print STDERR "Problem writing to disk\n";
1116 print STDERR "the string is $string\n";
1117 die "Could not write to file handle";
1118 }
1119 }
1120
1121 1;