0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =cut
|
|
20
|
|
21 =head1 NAME
|
|
22
|
|
23 Bio::EnsEMBL::Utils::SeqDumper
|
|
24
|
|
25 =head1 SYNOPSIS
|
|
26
|
|
27 $seq_dumper = Bio::EnsEMBL::Utils::SeqDumper->new();
|
|
28
|
|
29 # don't dump snps or repeats
|
|
30 $seq_dumper->disable_feature_type('repeat');
|
|
31 $seq_dumper->disable_feature_type('variation');
|
|
32
|
|
33 # dump EMBL format to STDOUT
|
|
34 $seq_dumper->dump( $slice, 'EMBL' );
|
|
35
|
|
36 # dump GENBANK format to a file
|
|
37 $seq_dumper->dump( $slice, 'GENBANK', 'out.genbank' );
|
|
38
|
|
39 # dump FASTA format to a file
|
|
40 $seq_dumper->dump( $slice, 'FASTA', 'out.fasta' );
|
|
41
|
|
42 =head1 DESCRIPTION
|
|
43
|
|
44 A relatively simple and lite-weight flat file dumper for Ensembl slices.
|
|
45 The memory efficiency could be improved and this is currently not very
|
|
46 good for dumping very large sequences such as whole chromosomes.
|
|
47
|
|
48 =head1 METHODS
|
|
49
|
|
50 =cut
|
|
51
|
|
52
|
|
53 package Bio::EnsEMBL::Utils::SeqDumper;
|
|
54
|
|
55 use strict;
|
|
56
|
|
57 use IO::File;
|
|
58 use vars qw(@ISA);
|
|
59
|
|
60 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
|
|
61
|
|
62 #keys must be uppercase
|
|
63 my $DUMP_HANDLERS =
|
|
64 { 'FASTA' => \&dump_fasta,
|
|
65 'EMBL' => \&dump_embl,
|
|
66 'GENBANK' => \&dump_genbank };
|
|
67
|
|
68 my @COMMENTS =
|
|
69 ('This sequence was annotated by the Ensembl system. Please visit ' .
|
|
70 'the Ensembl web site, http://www.ensembl.org/ for more information.',
|
|
71
|
|
72 'All feature locations are relative to the first (5\') base ' .
|
|
73 'of the sequence in this file. The sequence presented is '.
|
|
74 'always the forward strand of the assembly. Features ' .
|
|
75 'that lie outside of the sequence contained in this file ' .
|
|
76 'have clonal location coordinates in the format: ' .
|
|
77 '<clone accession>.<version>:<start>..<end>',
|
|
78
|
|
79 'The /gene indicates a unique id for a gene, /note="transcript_id=..."' .
|
|
80 ' a unique id for a transcript, /protein_id a unique id for a peptide ' .
|
|
81 'and note="exon_id=..." a unique id for an exon. These ids are ' .
|
|
82 'maintained wherever possible between versions.',
|
|
83
|
|
84 'All the exons and transcripts in Ensembl are confirmed by ' .
|
|
85 'similarity to either protein or cDNA sequences.');
|
|
86
|
|
87
|
|
88 =head2 new
|
|
89
|
|
90 Arg [1] : none
|
|
91 Example : $seq_dumper = Bio::EnsEMBL::Utils::SeqDumper->new;
|
|
92 Description: Creates a new SeqDumper
|
|
93 Returntype : Bio::EnsEMBL::Utils::SeqDumper
|
|
94 Exceptions : none
|
|
95 Caller : general
|
|
96
|
|
97 =cut
|
|
98
|
|
99 sub new {
|
|
100 my ($caller, $slice, $params) = @_;
|
|
101
|
|
102 my $class = ref($caller) || $caller;
|
|
103
|
|
104 my $feature_types = {'gene' => 1,
|
|
105 'genscan' => 1,
|
|
106 'repeat' => 1,
|
|
107 'similarity' => 1,
|
|
108 'variation' => 1,
|
|
109 'contig' => 1,
|
|
110 'marker' => 1,
|
|
111 'estgene' => 0,
|
|
112 'vegagene' => 0};
|
|
113
|
|
114 my $self = bless {'feature_types' => $feature_types}, $class;
|
|
115
|
|
116 foreach my $p (sort keys %{$params || {}}) {
|
|
117 $self->{$p} = $params->{$p};
|
|
118 }
|
|
119
|
|
120 return $self;
|
|
121 }
|
|
122
|
|
123
|
|
124
|
|
125 =head2 enable_feature_type
|
|
126
|
|
127 Arg [1] : string $type
|
|
128 Example : $seq_dumper->enable_feature_type('similarity');
|
|
129 Description: Enables the dumping of a specific type of feature
|
|
130 Returntype : none
|
|
131 Exceptions : warn if invalid feature type is passed,
|
|
132 thrown if no feature type is passed
|
|
133 Caller : general
|
|
134
|
|
135 =cut
|
|
136
|
|
137 sub enable_feature_type {
|
|
138 my ($self, $type) = @_;
|
|
139
|
|
140 $type || throw("type arg is required");
|
|
141
|
|
142 if(exists($self->{'feature_types'}->{$type})) {
|
|
143 $self->{'feature_types'}->{$type} = 1;
|
|
144 } else {
|
|
145 warning("unknown feature type '$type'\n" .
|
|
146 "valid types are: " . join(',', keys %{$self->{'feature_types'}}));
|
|
147 }
|
|
148 }
|
|
149
|
|
150
|
|
151
|
|
152 =head2 attach_database
|
|
153
|
|
154 Arg [1] : string name
|
|
155 Arg [2] : Bio::EnsEMBL::DBSQL::DBAdaptor
|
|
156 Example : $seq_dumper->attach_database('estgene', $estgene_db);
|
|
157 Description: Attaches a database to the seqdumper that can be used to
|
|
158 dump data which is external to the ensembl core database.
|
|
159 Currently this is necessary to dump est genes and vega genes
|
|
160 Returntype : none
|
|
161 Exceptions : thrown if incorrect argument is supplied
|
|
162 Caller : general
|
|
163
|
|
164 =cut
|
|
165
|
|
166 sub attach_database {
|
|
167 my ($self, $name, $db) = @_;
|
|
168
|
|
169 $name || throw("name arg is required");
|
|
170 unless($db && ref($db) && $db->isa('Bio::EnsEMBL::DBSQL::DBAdaptor')) {
|
|
171 throw("db arg must be a Bio::EnsEMBL::DBSQL::DBAdaptor not a [$db]");
|
|
172 }
|
|
173
|
|
174 $self->{'attached_dbs'}->{$name} = $db;
|
|
175 }
|
|
176
|
|
177
|
|
178
|
|
179 =head2 get_database
|
|
180
|
|
181 Arg [1] : string $name
|
|
182 Example : $db = $seq_dumper->get_database('vega');
|
|
183 Description: Retrieves a database that has been attached to the
|
|
184 seqdumper via the attach database call.
|
|
185 Returntype : Bio::EnsEMBL::DBSQL::DBAdaptor
|
|
186 Exceptions : thrown if incorrect argument is supplied
|
|
187 Caller : dump_feature_table
|
|
188
|
|
189 =cut
|
|
190
|
|
191 sub get_database {
|
|
192 my ($self, $name) = @_;
|
|
193
|
|
194 $name || throw("name arg is required");
|
|
195
|
|
196 return $self->{'attached_dbs'}->{$name};
|
|
197 }
|
|
198
|
|
199
|
|
200
|
|
201 =head2 remove_database
|
|
202
|
|
203 Arg [1] : string $name
|
|
204 Example : $db = $seq_dumper->remove_database('estgene');
|
|
205 Description: Removes a database that has been attached to the seqdumper
|
|
206 via the attach database call. The database that is removed
|
|
207 is returned (or undef if it did not exist).
|
|
208 Returntype : Bio::EnsEMBL::DBSQL::DBAdaptor
|
|
209 Exceptions : thrown if incorrect argument is supplied
|
|
210 Caller : general
|
|
211
|
|
212 =cut
|
|
213
|
|
214 sub remove_database {
|
|
215 my ($self, $name) = @_;
|
|
216
|
|
217 $name || throw("name arg is required");
|
|
218
|
|
219 if(exists $self->{'attached_dbs'}->{$name}) {
|
|
220 return delete $self->{'attached_dbs'}->{$name};
|
|
221 }
|
|
222
|
|
223 return undef;
|
|
224 }
|
|
225
|
|
226
|
|
227 =head2 disable_feature_type
|
|
228
|
|
229 Arg [1] : string $type
|
|
230 Example : $seq_dumper->disable_feature_type('genes');
|
|
231 Description: Disables the dumping of a specific type of feature
|
|
232 Returntype : none
|
|
233 Exceptions : warn if an invalid feature type is passed,
|
|
234 thrown if no feature type is passed
|
|
235 Caller : general
|
|
236
|
|
237 =cut
|
|
238
|
|
239 sub disable_feature_type {
|
|
240 my ($self, $type) = @_;
|
|
241
|
|
242 $type || throw("type arg is required");
|
|
243
|
|
244 if(exists($self->{'feature_types'}->{$type})) {
|
|
245 $self->{'feature_types'}->{$type} = 0;
|
|
246 } else {
|
|
247 warning("unknown feature type '$type'\n" .
|
|
248 "valid types are: " . join(',', keys %{$self->{'feature_types'}}));
|
|
249 }
|
|
250 }
|
|
251
|
|
252
|
|
253
|
|
254 =head2 is_enabled
|
|
255
|
|
256 Arg [1] : string $type
|
|
257 Example : do_something() if($seq_dumper->is_enabled('gene'));
|
|
258 Description: checks if a specific feature type is enabled
|
|
259 Returntype : none
|
|
260 Exceptions : warning if invalid type is passed,
|
|
261 thrown if no type is passed
|
|
262 Caller : general
|
|
263
|
|
264 =cut
|
|
265
|
|
266 sub is_enabled {
|
|
267 my ($self, $type) = @_;
|
|
268
|
|
269 $type || throw("type arg is required");
|
|
270
|
|
271 if(exists($self->{'feature_types'}->{$type})) {
|
|
272 return $self->{'feature_types'}->{$type};
|
|
273 } else {
|
|
274 warning("unknown feature type '$type'\n" .
|
|
275 "valid types are: " . join(',', keys %{$self->{'feature_types'}}));
|
|
276 }
|
|
277 }
|
|
278
|
|
279
|
|
280 =head2 dump
|
|
281
|
|
282 Arg [1] : Bio::EnsEMBL::Slice slice
|
|
283 The slice to dump
|
|
284 Arg [2] : string $format
|
|
285 The name of the format to dump
|
|
286 Arg [3] : (optional) $outfile
|
|
287 The name of the file to dump to. If no file is specified STDOUT
|
|
288 is used
|
|
289 Arg [4] : (optional) $seq
|
|
290 Sequence to dump
|
|
291 Arg [4] : (optional) $no_append
|
|
292 Default action is to open the file in append mode. This will
|
|
293 turn that mode off
|
|
294 Example : $seq_dumper->dump($slice, 'EMBL');
|
|
295 Description: Dumps a region of a genome specified by the slice argument into
|
|
296 an outfile of the format $format
|
|
297 Returntype : none
|
|
298 Exceptions : thrown if slice or format args are not supplied
|
|
299 Caller : general
|
|
300
|
|
301 =cut
|
|
302
|
|
303
|
|
304 sub dump {
|
|
305 my ($self, $slice, $format, $outfile, $seq, $no_append) = @_;
|
|
306
|
|
307 $format || throw("format arg is required");
|
|
308 $slice || throw("slice arg is required");
|
|
309
|
|
310 my $dump_handler = $DUMP_HANDLERS->{uc($format)};
|
|
311
|
|
312 unless($dump_handler) {
|
|
313 throw("No dump handler is defined for format $format\n");
|
|
314 }
|
|
315
|
|
316
|
|
317 my $FH = IO::File->new;;
|
|
318 if($outfile) {
|
|
319 my $mode = ($no_append) ? '>' : '>>';
|
|
320 $FH->open("${mode}${outfile}") or throw("Could not open file $outfile: $!");
|
|
321 } else {
|
|
322 $FH = \*STDOUT;
|
|
323 #mod_perl did not like the following
|
|
324 #$FH->fdopen(fileno(STDOUT), "w")
|
|
325 # or throw("Could not open currently selected output filehandle " .
|
|
326 # "for writing");
|
|
327 }
|
|
328
|
|
329 &$dump_handler($self, $slice, $FH, $seq);
|
|
330
|
|
331 $FH->close if ($outfile); #close if we were writing to a file
|
|
332 }
|
|
333
|
|
334 =head2 dump_embl
|
|
335
|
|
336 Arg [1] : Bio::EnsEMBL::Slice
|
|
337 Arg [2] : IO::File $FH
|
|
338 Arg [3] : optional sequence string
|
|
339 Example : $seq_dumper->dump_embl($slice, $FH);
|
|
340 Description: Dumps an EMBL flat file to an open file handle
|
|
341 Returntype : none
|
|
342 Exceptions : none
|
|
343 Caller : dump
|
|
344
|
|
345 =cut
|
|
346
|
|
347 sub dump_embl {
|
|
348 my $self = shift;
|
|
349 my $slice = shift;
|
|
350 my $FH = shift;
|
|
351 my $SEQ = shift;
|
|
352
|
|
353 my $len = $slice->length;
|
|
354
|
|
355 my $version;
|
|
356 my $acc;
|
|
357
|
|
358 my $cs = $slice->coord_system();
|
|
359 my $name_str = $cs->name() . ' ' . $slice->seq_region_name();
|
|
360 $name_str .= ' ' . $cs->version if($cs->version);
|
|
361
|
|
362 my $start = $slice->start;
|
|
363 my $end = $slice->end;
|
|
364
|
|
365 #determine if this slice is the entire seq region
|
|
366 #if it is then we just use the name as the id
|
|
367 my $slice_adaptor = $slice->adaptor();
|
|
368 my $full_slice =
|
|
369 $slice->adaptor->fetch_by_region($cs->name,
|
|
370 $slice->seq_region_name,
|
|
371 undef,undef,undef,
|
|
372 $cs->version);
|
|
373
|
|
374
|
|
375 my $entry_name = $slice->seq_region_name();
|
|
376
|
|
377
|
|
378
|
|
379 if($full_slice->name eq $slice->name) {
|
|
380 $name_str .= ' full sequence';
|
|
381 $acc = $slice->seq_region_name();
|
|
382 my @acc_ver = split(/\./, $acc);
|
|
383 if(@acc_ver == 2) {
|
|
384 $acc = $acc_ver[0];
|
|
385 $version = $acc_ver[0] . '.'. $acc_ver[1];
|
|
386 } elsif(@acc_ver == 1 && $cs->version()) {
|
|
387 $version = $acc . '.'. $cs->version();
|
|
388 } else {
|
|
389 $version = $acc;
|
|
390 }
|
|
391 } else {
|
|
392 $name_str .= ' partial sequence';
|
|
393 $acc = $slice->name();
|
|
394 $version = $acc;
|
|
395 }
|
|
396
|
|
397 $acc = $slice->name();
|
|
398
|
|
399
|
|
400
|
|
401 #line breaks are allowed near the end of the line on ' ', "\t", "\n", ','
|
|
402 $: = (" \t\n-,");
|
|
403
|
|
404 #############
|
|
405 # dump header
|
|
406 #############
|
|
407
|
|
408 my $EMBL_HEADER =
|
|
409 '@< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~
|
|
410 ';
|
|
411
|
|
412 #ID and moltype
|
|
413 # HTG = High Throughput Genome division, probably most suitable
|
|
414 # and it would be hard to come up with another appropriate division
|
|
415 # that worked for all organisms (e.g. plants are in PLN but human is
|
|
416 # in HUM).
|
|
417 my $VALUE = "$entry_name standard; DNA; HTG; $len BP.";
|
|
418 $self->write($FH, $EMBL_HEADER, 'ID', $VALUE);
|
|
419 $self->print( $FH, "XX\n" );
|
|
420
|
|
421 #Accession
|
|
422 $self->write($FH, $EMBL_HEADER, 'AC', $acc);
|
|
423 $self->print( $FH, "XX\n" );
|
|
424
|
|
425 #Version
|
|
426 $self->write($FH, $EMBL_HEADER, 'SV', $version);
|
|
427 $self->print( $FH, "XX\n" );
|
|
428
|
|
429 #Date
|
|
430 $self->write($FH, $EMBL_HEADER, 'DT', $self->_date_string);
|
|
431 $self->print( $FH, "XX\n" );
|
|
432
|
|
433 my $meta_container = $slice->adaptor()->db()->get_MetaContainer();
|
|
434
|
|
435 #Description
|
|
436 $self->write($FH, $EMBL_HEADER, 'DE', $meta_container->get_scientific_name() .
|
|
437 " $name_str $start..$end annotated by Ensembl");
|
|
438 $self->print( $FH, "XX\n" );
|
|
439
|
|
440 #key words
|
|
441 $self->write($FH, $EMBL_HEADER, 'KW', '.');
|
|
442 $self->print( $FH, "XX\n" );
|
|
443
|
|
444 #Species
|
|
445 my $species_name = $meta_container->get_scientific_name();
|
|
446 if(my $cn = $meta_container->get_common_name()) {
|
|
447 $species_name .= " ($cn)";
|
|
448 }
|
|
449
|
|
450 $self->write($FH, $EMBL_HEADER, 'OS', $species_name);
|
|
451
|
|
452 #Classification
|
|
453 my $cls = $meta_container->get_classification();
|
|
454 $self->write($FH, $EMBL_HEADER, 'OC', join('; ', reverse(@{$cls})) . '.');
|
|
455 $self->print( $FH, "XX\n" );
|
|
456
|
|
457 #References (we are not dumping refereneces)
|
|
458
|
|
459 #Database References (we are not dumping these)
|
|
460
|
|
461 #comments
|
|
462 foreach my $comment (@COMMENTS) {
|
|
463 $self->write($FH, $EMBL_HEADER, 'CC', $comment);
|
|
464 $self->print( $FH, "XX\n" );
|
|
465 }
|
|
466
|
|
467 ####################
|
|
468 #DUMP FEATURE TABLE
|
|
469 ####################
|
|
470 $self->print( $FH, "FH Key Location/Qualifiers\n" );
|
|
471
|
|
472 my $FEATURE_TABLE =
|
|
473 'FT ^<<<<<<<<<<<<<<<^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~
|
|
474 ';
|
|
475 $self->_dump_feature_table($slice, $FH, $FEATURE_TABLE);
|
|
476
|
|
477 #write an XX after the feature tables
|
|
478 $self->print( $FH, "XX\n" );
|
|
479
|
|
480 ###################
|
|
481 #DUMP SEQUENCE
|
|
482 ###################
|
|
483
|
|
484 if(!defined($SEQ)){
|
|
485 $SEQ = $slice->seq();
|
|
486 }
|
|
487 # my $SEQ = $slice->seq();
|
|
488 my $length = length($SEQ);
|
|
489 my $a_count = $SEQ =~ tr/aA/aA/;
|
|
490 my $c_count = $SEQ =~ tr/cC/cC/;
|
|
491 my $t_count = $SEQ =~ tr/tT/tT/;
|
|
492 my $g_count = $SEQ =~ tr/gG/gG/;
|
|
493 my $other_count = $length - $a_count - $c_count - $t_count - $g_count;
|
|
494
|
|
495 my $value = "Sequence $length BP; $a_count A; $c_count C; " .
|
|
496 "$g_count G; $t_count T; $other_count other;";
|
|
497 $self->print($FH, 'SQ '.$value."\n");
|
|
498
|
|
499 $self->write_embl_seq($FH, \$SEQ);
|
|
500
|
|
501
|
|
502 $self->print( $FH, "//\n" );
|
|
503
|
|
504 # Set formatting back to normal
|
|
505 $: = " \n-";
|
|
506 }
|
|
507
|
|
508
|
|
509
|
|
510
|
|
511 =head2 dump_genbank
|
|
512
|
|
513 Arg [1] : Bio::EnsEMBL::Slice
|
|
514 Arg [2] : IO::File $FH
|
|
515 Example : $seq_dumper->dump_genbank($slice, $FH);
|
|
516 Description: Dumps a GENBANK flat file to an open file handle
|
|
517 Returntype : none
|
|
518 Exceptions : none
|
|
519 Caller : dump
|
|
520
|
|
521 =cut
|
|
522
|
|
523 sub dump_genbank {
|
|
524 my ($self, $slice, $FH, $SEQ) = @_;
|
|
525
|
|
526 #line breaks are allowed near the end of the line on ' ', "\t", "\n", ','
|
|
527 $: = " \t\n-,";
|
|
528
|
|
529 my $GENBANK_HEADER =
|
|
530 '^<<<<<<<<< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~
|
|
531 ';
|
|
532
|
|
533 my $GENBANK_SUBHEADER =
|
|
534 ' ^<<<<<<< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~
|
|
535 ';
|
|
536
|
|
537 my $GENBANK_FT =
|
|
538 ' ^<<<<<<<<<<<<<< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~
|
|
539 ';
|
|
540
|
|
541 my $version;
|
|
542 my $acc;
|
|
543
|
|
544 my $cs = $slice->coord_system();
|
|
545
|
|
546 my $name_str = $cs->name() . ' ' . $slice->seq_region_name();
|
|
547
|
|
548 $name_str .= ' ' . $cs->version if($cs->version);
|
|
549
|
|
550 #determine if this slice is the entire seq region
|
|
551 #if it is then we just use the name as the id
|
|
552 my $slice_adaptor = $slice->adaptor();
|
|
553 my $full_slice =
|
|
554 $slice->adaptor->fetch_by_region($cs->name,
|
|
555 $slice->seq_region_name,
|
|
556 undef,undef,undef,
|
|
557 $cs->version);
|
|
558
|
|
559
|
|
560 my $entry_name = $slice->seq_region_name();
|
|
561
|
|
562 if($full_slice->name eq $slice->name) {
|
|
563 $name_str .= ' full sequence';
|
|
564 $acc = $slice->seq_region_name();
|
|
565 my @acc_ver = split(/\./, $acc);
|
|
566 if(@acc_ver == 2) {
|
|
567 $acc = $acc_ver[0];
|
|
568 $version = $acc_ver[0] . $acc_ver[1];
|
|
569 } elsif(@acc_ver == 1 && $cs->version()) {
|
|
570 $version = $acc . $cs->version();
|
|
571 } else {
|
|
572 $version = $acc;
|
|
573 }
|
|
574 } else {
|
|
575 $name_str .= ' partial sequence';
|
|
576 $acc = $slice->name();
|
|
577 $version = $acc;
|
|
578 }
|
|
579
|
|
580 $acc = $slice->name(); # to keep format consistent for all
|
|
581
|
|
582 my $length = $slice->length;
|
|
583 my $start = $slice->start();
|
|
584 my $end = $slice->end();
|
|
585
|
|
586 my $date = $self->_date_string;
|
|
587
|
|
588 my $meta_container = $slice->adaptor()->db()->get_MetaContainer();
|
|
589
|
|
590 #LOCUS
|
|
591 my $tag = 'LOCUS';
|
|
592 my $value = "$entry_name $length bp DNA HTG $date";
|
|
593 $self->write($FH, $GENBANK_HEADER, $tag, $value);
|
|
594
|
|
595 #DEFINITION
|
|
596 $tag = "DEFINITION";
|
|
597 $value = $meta_container->get_scientific_name() .
|
|
598 " $name_str $start..$end reannotated via EnsEMBL";
|
|
599 $self->write($FH, $GENBANK_HEADER, $tag, $value);
|
|
600
|
|
601 #ACCESSION
|
|
602 $self->write($FH, $GENBANK_HEADER, 'ACCESSION', $acc);
|
|
603
|
|
604 #VERSION
|
|
605 $self->write($FH, $GENBANK_HEADER, 'VERSION', $version);
|
|
606
|
|
607 # KEYWORDS
|
|
608 $self->write($FH, $GENBANK_HEADER, 'KEYWORDS', '.');
|
|
609
|
|
610 # SOURCE
|
|
611 my $common_name = $meta_container->get_common_name();
|
|
612 $common_name = $meta_container->get_scientific_name() unless $common_name;
|
|
613 $self->write($FH, $GENBANK_HEADER, 'SOURCE', $common_name);
|
|
614
|
|
615 #organism
|
|
616 my @cls = $meta_container->get_classification();
|
|
617 shift @cls;
|
|
618 $self->write($FH, $GENBANK_SUBHEADER, 'ORGANISM', $meta_container->get_scientific_name());
|
|
619 $self->write($FH, $GENBANK_SUBHEADER, '', join('; ', reverse @cls) . ".");
|
|
620
|
|
621 #refereneces
|
|
622
|
|
623 #comments
|
|
624 foreach my $comment (@COMMENTS) {
|
|
625 $self->write($FH, $GENBANK_HEADER, 'COMMENT', $comment);
|
|
626 }
|
|
627
|
|
628 ####################
|
|
629 # DUMP FEATURE TABLE
|
|
630 ####################
|
|
631 $self->print( $FH, "FEATURES Location/Qualifiers\n" );
|
|
632 $self->_dump_feature_table($slice, $FH, $GENBANK_FT);
|
|
633
|
|
634 ####################
|
|
635 # DUMP SEQUENCE
|
|
636 ####################
|
|
637
|
|
638 if(!defined($SEQ)){
|
|
639 $SEQ = $slice->seq();
|
|
640 }
|
|
641 # my $SEQ = $slice->seq();
|
|
642 my $a_count = $SEQ =~ tr/aA/aA/;
|
|
643 my $c_count = $SEQ =~ tr/cC/cC/;
|
|
644 my $t_count = $SEQ =~ tr/tT/tT/;
|
|
645 my $g_count = $SEQ =~ tr/gG/gG/;
|
|
646 my $bp_length = length($SEQ);
|
|
647 my $other_count = $bp_length - $a_count - $c_count - $t_count - $g_count;
|
|
648
|
|
649 $tag = 'BASE COUNT';
|
|
650 $value = "$a_count a $c_count c $g_count g $t_count t";
|
|
651 $value .= " $other_count n" if($other_count);
|
|
652 $self->print($FH, qq{$tag $value\n});
|
|
653 $self->print( $FH, "ORIGIN\n" );
|
|
654
|
|
655 $self->write_genbank_seq($FH, \$SEQ);
|
|
656
|
|
657 $self->print( $FH, "//\n" );
|
|
658
|
|
659 # Set formatting back to normal
|
|
660 $: = " \n-";
|
|
661 }
|
|
662
|
|
663
|
|
664
|
|
665 =head2 _dump_feature_table
|
|
666
|
|
667 Arg [1] : Bio::EnsEMBL::Slice slice
|
|
668 Example : none
|
|
669 Description: Helper method used to dump feature tables used in EMBL, FASTA,
|
|
670 GENBANK. Assumes formating of file handle has been setup
|
|
671 already to use $FEAT and $VALUE values.
|
|
672 Returntype : none
|
|
673 Exceptions : none
|
|
674 Caller : internal
|
|
675
|
|
676 =cut
|
|
677
|
|
678 sub _dump_feature_table {
|
|
679 my $self = shift;
|
|
680 my $slice = shift;
|
|
681 my $FH = shift;
|
|
682 my $FORMAT = shift;
|
|
683
|
|
684 #use only the core database to dump features (except for bloody snps)
|
|
685 my $lite = $slice->adaptor->db->remove_db_adaptor('lite');
|
|
686
|
|
687 my $meta = $slice->adaptor->db->get_MetaContainer;
|
|
688
|
|
689 #lump file handle and format string together for simpler method calls
|
|
690 my @ff = ($FH, $FORMAT);
|
|
691 my $value;
|
|
692
|
|
693 #source
|
|
694 my $classification = join(', ', $meta->get_classification());
|
|
695 $self->write(@ff,'source', "1.." . $slice->length());
|
|
696 $self->write(@ff,'' , '/organism="'.$meta->get_scientific_name(). '"');
|
|
697 $self->write(@ff,'' , '/db_xref="taxon:'.$meta->get_taxonomy_id().'"');
|
|
698
|
|
699 #
|
|
700 # Transcripts & Genes
|
|
701 #
|
|
702 my @gene_slices;
|
|
703 if($self->is_enabled('gene')) {
|
|
704 push @gene_slices, $slice;
|
|
705 }
|
|
706
|
|
707 # Retrieve slices of other database where we need to pull genes from
|
|
708
|
|
709 my $gene_dbs = {'vegagene' => 'vega',
|
|
710 'estgene' => 'estgene'};
|
|
711
|
|
712 foreach my $gene_type (keys %$gene_dbs) {
|
|
713 if($self->is_enabled($gene_type)) {
|
|
714 my $db = $self->get_database($gene_dbs->{$gene_type});
|
|
715 if($db) {
|
|
716 my $sa = $db->get_SliceAdaptor();
|
|
717 push @gene_slices, $sa->fetch_by_name($slice->name());
|
|
718 } else {
|
|
719 warning("A [". $gene_dbs->{$gene_type} ."] database must be " .
|
|
720 "attached to this SeqDumper\n(via a call to " .
|
|
721 "attach_database) to retrieve genes of type [$gene_type]");
|
|
722 }
|
|
723 }
|
|
724 }
|
|
725
|
|
726 foreach my $gene_slice (@gene_slices) {
|
|
727 my @genes = @{$gene_slice->get_all_Genes(undef,undef, 1)};
|
|
728 while(my $gene = shift @genes) {
|
|
729 $value = $self->features2location( [$gene] );
|
|
730 $self->write( @ff, 'gene', $value );
|
|
731 $self->write( @ff, "", '/gene='.$gene->stable_id() );
|
|
732
|
|
733
|
|
734 if(defined($gene->display_xref)){
|
|
735 $self->write( @ff, "",'/locus_tag="'.$gene->display_xref->display_id.'"');
|
|
736 }
|
|
737 my $desc = $gene->description;
|
|
738 if(defined($desc) and $desc ne ""){
|
|
739 $desc =~ s/\"//;
|
|
740 $self->write( @ff, "", '/note="'.$gene->description.'"');
|
|
741 }
|
|
742
|
|
743
|
|
744
|
|
745 foreach my $transcript (@{$gene->get_all_Transcripts}) {
|
|
746 my $translation = $transcript->translation;
|
|
747
|
|
748 # normal transcripts get dumped differently than pseudogenes
|
|
749 if($translation) {
|
|
750 #normal transcript
|
|
751 $value = $self->features2location($transcript->get_all_Exons);
|
|
752 $self->write(@ff, 'mRNA', $value);
|
|
753 $self->write(@ff,'' , '/gene="'.$gene->stable_id().'"');
|
|
754 $self->write(@ff,''
|
|
755 ,'/note="transcript_id='.$transcript->stable_id().'"');
|
|
756
|
|
757 # ...and a CDS section
|
|
758 $value =
|
|
759 $self->features2location($transcript->get_all_translateable_Exons);
|
|
760 $self->write(@ff,'CDS', $value);
|
|
761 my $codon_start = $self->transcript_to_codon_start($transcript);
|
|
762 $self->write(@ff,'' , qq{/codon_start="${codon_start}"}) if $codon_start > 1;
|
|
763 $self->write(@ff,'' , '/gene="'.$gene->stable_id().'"');
|
|
764 $self->write(@ff,'' , '/protein_id="'.$translation->stable_id().'"');
|
|
765 $self->write(@ff,'' ,'/note="transcript_id='.$transcript->stable_id().'"');
|
|
766
|
|
767 foreach my $dbl (@{$transcript->get_all_DBLinks}) {
|
|
768 $value = '/db_xref="'.$dbl->dbname().':'.$dbl->display_id().'"';
|
|
769 $self->write(@ff, '', $value);
|
|
770 }
|
|
771
|
|
772 $value = '/translation="'.$transcript->translate()->seq().'"';
|
|
773 $self->write(@ff, '', $value);
|
|
774 } else {
|
|
775 #pseudogene
|
|
776 $value = $self->features2location($transcript->get_all_Exons);
|
|
777 $self->write(@ff, 'misc_RNA', $value);
|
|
778 $self->write(@ff,'' , '/gene="'.$gene->stable_id().'"');
|
|
779 foreach my $dbl (@{$transcript->get_all_DBLinks}) {
|
|
780 $value = '/db_xref="'.$dbl->dbname().':'.$dbl->primary_id().'"';
|
|
781 $self->write(@ff, '', $value);
|
|
782 }
|
|
783 $self->write(@ff,'' , '/note="'.$transcript->biotype().'"');
|
|
784 $self->write(@ff,''
|
|
785 ,'/note="transcript_id='.$transcript->stable_id().'"');
|
|
786 }
|
|
787 }
|
|
788 }
|
|
789
|
|
790 # exons
|
|
791 foreach my $gene (@{$gene_slice->get_all_Genes(undef,undef,1)}) {
|
|
792 foreach my $exon (@{$gene->get_all_Exons}) {
|
|
793 $self->write(@ff,'exon', $self->features2location([$exon]));
|
|
794 $self->write(@ff,'' , '/note="exon_id='.$exon->stable_id().'"');
|
|
795 }
|
|
796 }
|
|
797 }
|
|
798
|
|
799 #
|
|
800 # genscans
|
|
801 #
|
|
802 if($self->is_enabled('genscan')) {
|
|
803 my @genscan_exons;
|
|
804 my @transcripts = @{$slice->get_all_PredictionTranscripts(undef,1)};
|
|
805 while(my $transcript = shift @transcripts) {
|
|
806 my $exons = $transcript->get_all_Exons();
|
|
807 push @genscan_exons, @$exons;
|
|
808 $self->write(@ff, 'mRNA', $self->features2location($exons));
|
|
809 $self->write(@ff, '', '/product="'.$transcript->translate()->seq().'"');
|
|
810 $self->write(@ff, '', '/note="identifier='.$transcript->stable_id.'"');
|
|
811 $self->write(@ff, '', '/note="Derived by automated computational' .
|
|
812 ' analysis using gene prediction method:' .
|
|
813 $transcript->analysis->logic_name . '"');
|
|
814 }
|
|
815 }
|
|
816
|
|
817 #
|
|
818 # snps
|
|
819 #
|
|
820 if($self->is_enabled('variation') && $slice->can('get_all_VariationFeatures')) {
|
|
821 # $slice->adaptor->db->add_db_adaptor('lite', $lite) if $lite;
|
|
822
|
|
823 my @variations = @{$slice->get_all_VariationFeatures()};
|
|
824 while(my $snp = shift @variations) {
|
|
825 my $ss = $snp->start;
|
|
826 my $se = $snp->end;
|
|
827 #skip snps that hang off edge of slice
|
|
828 next if($ss < 1 || $se > $slice->length);
|
|
829
|
|
830 $self->write(@ff, 'variation', "$ss..$se");
|
|
831 $self->write(@ff, '' , '/replace="'.$snp->allele_string.'"');
|
|
832 #$self->write(@ff, '' , '/evidence="'.$snp->status.'"');
|
|
833 my $rs_id = $snp->variation_name();
|
|
834 my $db = $snp->source();
|
|
835 # foreach my $link ($snp->each_DBLink) {
|
|
836 # my $id = $link->primary_id;
|
|
837 # my $db = $link->database;
|
|
838 $self->write(@ff, '', "/db_xref=\"$db:$rs_id\"");
|
|
839 # }
|
|
840 }
|
|
841
|
|
842 # $slice->adaptor->db->remove_db_adaptor('lite') if $lite;
|
|
843 }
|
|
844
|
|
845 #
|
|
846 # similarity features
|
|
847 #
|
|
848 if($self->is_enabled('similarity')) {
|
|
849 foreach my $sim (@{$slice->get_all_SimilarityFeatures}) {
|
|
850 $self->write(@ff, 'misc_feature', $self->features2location([$sim]));
|
|
851 $self->write(@ff, '' , '/note="match: '.$sim->hseqname.
|
|
852 ' : '.$sim->hstart.'..'.$sim->hend.'('.$sim->hstrand.')"');
|
|
853 }
|
|
854 }
|
|
855
|
|
856 #
|
|
857 # repeats
|
|
858 #
|
|
859 if($self->is_enabled('repeat')) {
|
|
860 my @rfs = @{$slice->get_all_RepeatFeatures()};
|
|
861
|
|
862 while(my $repeat = shift @rfs) {
|
|
863 $self->write(@ff, 'repeat_region', $self->features2location([$repeat]));
|
|
864 $self->write(@ff, '' , '/note="' . $repeat->repeat_consensus->name.
|
|
865 ' repeat: matches ' . $repeat->hstart.'..'.$repeat->hend .
|
|
866 '('.$repeat->hstrand.') of consensus"');
|
|
867 }
|
|
868
|
|
869 }
|
|
870
|
|
871 #
|
|
872 # markers
|
|
873 #
|
|
874 if($self->is_enabled('marker') && $slice->can('get_all_MarkerFeatures')) {
|
|
875 my @markers = @{$slice->get_all_MarkerFeatures()};
|
|
876 while(my $mf = shift @markers) {
|
|
877 $self->write(@ff, 'STS', $self->features2location([$mf]));
|
|
878 if($mf->marker->display_MarkerSynonym) {
|
|
879 $self->write(@ff, '' , '/standard_name="' .
|
|
880 $mf->marker->display_MarkerSynonym->name . '"');
|
|
881 }
|
|
882
|
|
883
|
|
884 #grep out synonyms without a source
|
|
885 my @synonyms = @{$mf->marker->get_all_MarkerSynonyms};
|
|
886 @synonyms = grep {$_->source } @synonyms;
|
|
887 foreach my $synonym (@synonyms) {
|
|
888 $self->write(@ff, '', '/db_xref="'.$synonym->source.
|
|
889 ':'.$synonym->name.'"');
|
|
890 }
|
|
891 $self->write(@ff, '', '/note="map_weight='.$mf->map_weight.'"');
|
|
892 }
|
|
893 }
|
|
894
|
|
895 #
|
|
896 # contigs
|
|
897 #
|
|
898 if($self->is_enabled('contig')) {
|
|
899 foreach my $segment (@{$slice->project('seqlevel')}) {
|
|
900 my ($start, $end, $slice) = @$segment;
|
|
901 $self->write(@ff, 'misc_feature',
|
|
902 $start .'..'. $end);
|
|
903 $self->write(@ff, '', '/note="contig '.$slice->seq_region_name .
|
|
904 ' ' . $slice->start . '..' . $slice->end .
|
|
905 '(' . $slice->strand . ')"');
|
|
906 }
|
|
907 }
|
|
908
|
|
909 $slice->adaptor->db->add_db_adaptor('lite', $lite) if $lite;
|
|
910
|
|
911 }
|
|
912
|
|
913 # /codon_start= is the first base to start translating from. This maps
|
|
914 # Ensembl start exon phase to this concept. Here we present the usage
|
|
915 # of phase in this concept where each row shows the sequence the
|
|
916 # spliced_seq() method will return
|
|
917
|
|
918 # 123456789
|
|
919 # ATTATGACG
|
|
920 # Phase == 0 ...+++### codon_start=0 // start from 1st A
|
|
921 # Phase == 1 ..+++### codon_start=3 // start from 2nd A (base 3 in the given spliced sequence)
|
|
922 # Phase == 2 .+++### codon_start=2 // start from 2nd A (base 2 in the spliced sequence)
|
|
923 #
|
|
924 # In the case of the final 2 phases we will generate a X codon
|
|
925 #
|
|
926
|
|
927 sub transcript_to_codon_start {
|
|
928 my ($self, $transcript) = @_;
|
|
929 my $start_phase = $transcript->start_Exon()->phase();
|
|
930 return ( $start_phase == 1 ) ? 3 :
|
|
931 ( $start_phase == 2 ) ? 2 :
|
|
932 ( $start_phase == 0 ) ? 1 :
|
|
933 -1;
|
|
934 }
|
|
935
|
|
936
|
|
937 =head2 dump_fasta
|
|
938
|
|
939 Arg [1] : Bio::EnsEMBL::Slice
|
|
940 Arg [2] : IO::File $FH
|
|
941 Example : $seq_dumper->dump_fasta($slice, $FH);
|
|
942 Description: Dumps an FASTA flat file to an open file handle
|
|
943 Returntype : none
|
|
944 Exceptions : none
|
|
945 Caller : dump
|
|
946
|
|
947 =cut
|
|
948
|
|
949 sub dump_fasta {
|
|
950 my $self = shift;
|
|
951 my $slice = shift;
|
|
952 my $FH = shift;
|
|
953
|
|
954 my $id = $slice->seq_region_name;
|
|
955 my $seqtype = 'dna';
|
|
956 my $idtype = $slice->coord_system->name;
|
|
957 my $location = $slice->name;
|
|
958 my $start = 1;
|
|
959 my $end = $slice->length();
|
|
960
|
|
961 my $header = ">$id $seqtype:$idtype $location\n";
|
|
962 $self->print( $FH, $header );
|
|
963
|
|
964 #set the formatting to FASTA
|
|
965 my $FORMAT = '^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
|
|
966 ';
|
|
967
|
|
968 #chunk the sequence in 60kb chunks to use less memory
|
|
969 my $cur = $start;
|
|
970 while($cur <= $end) {
|
|
971 my $to = $cur + 59_999;
|
|
972 $to = $end if($to > $end);
|
|
973 my $seq = $slice->subseq($cur, $to);
|
|
974 $cur = $to + 1;
|
|
975 $self->write($FH, $FORMAT, $seq);
|
|
976 }
|
|
977 }
|
|
978
|
|
979
|
|
980
|
|
981 =head2 features2location
|
|
982
|
|
983 Arg [1] : listref of Bio::EnsEMBL::SeqFeatures
|
|
984 Example : $location = $self->features2location(\@features);
|
|
985 Description: Constructs an EMBL location string from a list of features
|
|
986 Returntype : string
|
|
987 Exceptions : none
|
|
988 Caller : internal
|
|
989
|
|
990 =cut
|
|
991
|
|
992 sub features2location {
|
|
993 my $self = shift;
|
|
994 my $features = shift;
|
|
995
|
|
996 my @join = ();
|
|
997
|
|
998 foreach my $f (@$features) {
|
|
999 my $slice = $f->slice;
|
|
1000 my $start = $f->start();
|
|
1001 my $end = $f->end();
|
|
1002 my $strand = $f->strand();
|
|
1003
|
|
1004 if($start >= 1 && $end <= $slice->length) {
|
|
1005 #this feature in on a slice and doesn't lie outside the boundary
|
|
1006
|
|
1007 if($strand == 1) {
|
|
1008 push @join, "$start..$end";
|
|
1009 } else {
|
|
1010 push @join, "complement($start..$end)";
|
|
1011 }
|
|
1012 } else {
|
|
1013 my @fs = ();
|
|
1014 #this feature is outside the boundary of the dump,
|
|
1015 # yet implemented and 'seqlevel' is guaranteed to be 1step
|
|
1016 my $projection = $f->project('seqlevel');
|
|
1017 foreach my $segment (@$projection) {
|
|
1018 my $slice = $segment->[2];
|
|
1019 my $slc_start = $slice->start();
|
|
1020 my $slc_end = $slice->end();
|
|
1021 my $seq_reg = $slice->seq_region_name();
|
|
1022 if($slice->strand == 1) {
|
|
1023 push @join, "$seq_reg:$slc_start..$slc_end";
|
|
1024 } else {
|
|
1025 push @join, "complement($seq_reg:$slc_start..$slc_end)";
|
|
1026 }
|
|
1027 }
|
|
1028 }
|
|
1029 }
|
|
1030
|
|
1031 my $out = join ',', @join;
|
|
1032
|
|
1033 if(scalar @join > 1) {
|
|
1034 $out = "join($out)";
|
|
1035 }
|
|
1036
|
|
1037 return $out;
|
|
1038 }
|
|
1039
|
|
1040
|
|
1041 sub _date_string {
|
|
1042 my $self = shift;
|
|
1043
|
|
1044 my ($sec, $min, $hour, $mday,$mon, $year) = localtime(time());
|
|
1045
|
|
1046 my $month = ('JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL',
|
|
1047 'AUG', 'SEP', 'OCT', 'NOV', 'DEC')[$mon];
|
|
1048 $year += 1900;
|
|
1049
|
|
1050 return "$mday-$month-$year";
|
|
1051 }
|
|
1052
|
|
1053
|
|
1054 sub write {
|
|
1055 my ($self, $FH, $FORMAT, @values) = @_;
|
|
1056
|
|
1057 #while the last value still contains something
|
|
1058 while(defined($values[-1]) and $values[-1] ne '') {
|
|
1059 formline($FORMAT, @values);
|
|
1060 $self->print( $FH, $^A );
|
|
1061 $^A = '';
|
|
1062 }
|
|
1063 }
|
|
1064
|
|
1065 sub write_genbank_seq {
|
|
1066 my $self = shift;
|
|
1067 my $FH = shift;
|
|
1068 my $seq = shift;
|
|
1069 my $base_total = shift;
|
|
1070
|
|
1071 $base_total ||= 0;
|
|
1072
|
|
1073 my $GENBANK_SEQ =
|
|
1074 '@>>>>>>>> ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<<~
|
|
1075 ';
|
|
1076
|
|
1077 my $total = -59 + $base_total;
|
|
1078 #keep track of total and print lines of 60 bases with spaces every 10bp
|
|
1079 while($$seq) {
|
|
1080 $total += 60;
|
|
1081 formline($GENBANK_SEQ,$total, $$seq, $$seq, $$seq, $$seq, $$seq, $$seq);
|
|
1082 $self->print( $FH, $^A );
|
|
1083 $^A = '';
|
|
1084 }
|
|
1085 }
|
|
1086
|
|
1087 sub write_embl_seq {
|
|
1088 my $self = shift;
|
|
1089 my $FH = shift;
|
|
1090 my $seq = shift;
|
|
1091 my $base_total = shift;
|
|
1092
|
|
1093 $base_total ||= 0;
|
|
1094
|
|
1095 my $EMBL_SEQ =
|
|
1096 ' ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<<@>>>>>>>>>~
|
|
1097 ';
|
|
1098 #keep track of total and print lines of 60 bases with spaces every 10bp
|
|
1099 my $length = length($$seq);
|
|
1100 my $total = $length - $base_total;
|
|
1101 while($$seq) {
|
|
1102 $total -= 60;
|
|
1103 $total = 0 if($total < 0);
|
|
1104 formline($EMBL_SEQ,
|
|
1105 $$seq, $$seq, $$seq, $$seq, $$seq, $$seq,
|
|
1106 $length - $total);
|
|
1107 $self->print( $FH, $^A );
|
|
1108 $^A = '';
|
|
1109 }
|
|
1110 }
|
|
1111
|
|
1112 sub print {
|
|
1113 my( $self, $FH, $string ) = @_;
|
|
1114 if(!print $FH $string){
|
|
1115 print STDERR "Problem writing to disk\n";
|
|
1116 print STDERR "the string is $string\n";
|
|
1117 die "Could not write to file handle";
|
|
1118 }
|
|
1119 }
|
|
1120
|
|
1121 1;
|