Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Utils/SeqDumper.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <dev@ensembl.org>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =cut | |
20 | |
21 =head1 NAME | |
22 | |
23 Bio::EnsEMBL::Utils::SeqDumper | |
24 | |
25 =head1 SYNOPSIS | |
26 | |
27 $seq_dumper = Bio::EnsEMBL::Utils::SeqDumper->new(); | |
28 | |
29 # don't dump snps or repeats | |
30 $seq_dumper->disable_feature_type('repeat'); | |
31 $seq_dumper->disable_feature_type('variation'); | |
32 | |
33 # dump EMBL format to STDOUT | |
34 $seq_dumper->dump( $slice, 'EMBL' ); | |
35 | |
36 # dump GENBANK format to a file | |
37 $seq_dumper->dump( $slice, 'GENBANK', 'out.genbank' ); | |
38 | |
39 # dump FASTA format to a file | |
40 $seq_dumper->dump( $slice, 'FASTA', 'out.fasta' ); | |
41 | |
42 =head1 DESCRIPTION | |
43 | |
44 A relatively simple and lite-weight flat file dumper for Ensembl slices. | |
45 The memory efficiency could be improved and this is currently not very | |
46 good for dumping very large sequences such as whole chromosomes. | |
47 | |
48 =head1 METHODS | |
49 | |
50 =cut | |
51 | |
52 | |
53 package Bio::EnsEMBL::Utils::SeqDumper; | |
54 | |
55 use strict; | |
56 | |
57 use IO::File; | |
58 use vars qw(@ISA); | |
59 | |
60 use Bio::EnsEMBL::Utils::Exception qw(throw warning); | |
61 | |
62 #keys must be uppercase | |
63 my $DUMP_HANDLERS = | |
64 { 'FASTA' => \&dump_fasta, | |
65 'EMBL' => \&dump_embl, | |
66 'GENBANK' => \&dump_genbank }; | |
67 | |
68 my @COMMENTS = | |
69 ('This sequence was annotated by the Ensembl system. Please visit ' . | |
70 'the Ensembl web site, http://www.ensembl.org/ for more information.', | |
71 | |
72 'All feature locations are relative to the first (5\') base ' . | |
73 'of the sequence in this file. The sequence presented is '. | |
74 'always the forward strand of the assembly. Features ' . | |
75 'that lie outside of the sequence contained in this file ' . | |
76 'have clonal location coordinates in the format: ' . | |
77 '<clone accession>.<version>:<start>..<end>', | |
78 | |
79 'The /gene indicates a unique id for a gene, /note="transcript_id=..."' . | |
80 ' a unique id for a transcript, /protein_id a unique id for a peptide ' . | |
81 'and note="exon_id=..." a unique id for an exon. These ids are ' . | |
82 'maintained wherever possible between versions.', | |
83 | |
84 'All the exons and transcripts in Ensembl are confirmed by ' . | |
85 'similarity to either protein or cDNA sequences.'); | |
86 | |
87 | |
88 =head2 new | |
89 | |
90 Arg [1] : none | |
91 Example : $seq_dumper = Bio::EnsEMBL::Utils::SeqDumper->new; | |
92 Description: Creates a new SeqDumper | |
93 Returntype : Bio::EnsEMBL::Utils::SeqDumper | |
94 Exceptions : none | |
95 Caller : general | |
96 | |
97 =cut | |
98 | |
99 sub new { | |
100 my ($caller, $slice, $params) = @_; | |
101 | |
102 my $class = ref($caller) || $caller; | |
103 | |
104 my $feature_types = {'gene' => 1, | |
105 'genscan' => 1, | |
106 'repeat' => 1, | |
107 'similarity' => 1, | |
108 'variation' => 1, | |
109 'contig' => 1, | |
110 'marker' => 1, | |
111 'estgene' => 0, | |
112 'vegagene' => 0}; | |
113 | |
114 my $self = bless {'feature_types' => $feature_types}, $class; | |
115 | |
116 foreach my $p (sort keys %{$params || {}}) { | |
117 $self->{$p} = $params->{$p}; | |
118 } | |
119 | |
120 return $self; | |
121 } | |
122 | |
123 | |
124 | |
125 =head2 enable_feature_type | |
126 | |
127 Arg [1] : string $type | |
128 Example : $seq_dumper->enable_feature_type('similarity'); | |
129 Description: Enables the dumping of a specific type of feature | |
130 Returntype : none | |
131 Exceptions : warn if invalid feature type is passed, | |
132 thrown if no feature type is passed | |
133 Caller : general | |
134 | |
135 =cut | |
136 | |
137 sub enable_feature_type { | |
138 my ($self, $type) = @_; | |
139 | |
140 $type || throw("type arg is required"); | |
141 | |
142 if(exists($self->{'feature_types'}->{$type})) { | |
143 $self->{'feature_types'}->{$type} = 1; | |
144 } else { | |
145 warning("unknown feature type '$type'\n" . | |
146 "valid types are: " . join(',', keys %{$self->{'feature_types'}})); | |
147 } | |
148 } | |
149 | |
150 | |
151 | |
152 =head2 attach_database | |
153 | |
154 Arg [1] : string name | |
155 Arg [2] : Bio::EnsEMBL::DBSQL::DBAdaptor | |
156 Example : $seq_dumper->attach_database('estgene', $estgene_db); | |
157 Description: Attaches a database to the seqdumper that can be used to | |
158 dump data which is external to the ensembl core database. | |
159 Currently this is necessary to dump est genes and vega genes | |
160 Returntype : none | |
161 Exceptions : thrown if incorrect argument is supplied | |
162 Caller : general | |
163 | |
164 =cut | |
165 | |
166 sub attach_database { | |
167 my ($self, $name, $db) = @_; | |
168 | |
169 $name || throw("name arg is required"); | |
170 unless($db && ref($db) && $db->isa('Bio::EnsEMBL::DBSQL::DBAdaptor')) { | |
171 throw("db arg must be a Bio::EnsEMBL::DBSQL::DBAdaptor not a [$db]"); | |
172 } | |
173 | |
174 $self->{'attached_dbs'}->{$name} = $db; | |
175 } | |
176 | |
177 | |
178 | |
179 =head2 get_database | |
180 | |
181 Arg [1] : string $name | |
182 Example : $db = $seq_dumper->get_database('vega'); | |
183 Description: Retrieves a database that has been attached to the | |
184 seqdumper via the attach database call. | |
185 Returntype : Bio::EnsEMBL::DBSQL::DBAdaptor | |
186 Exceptions : thrown if incorrect argument is supplied | |
187 Caller : dump_feature_table | |
188 | |
189 =cut | |
190 | |
191 sub get_database { | |
192 my ($self, $name) = @_; | |
193 | |
194 $name || throw("name arg is required"); | |
195 | |
196 return $self->{'attached_dbs'}->{$name}; | |
197 } | |
198 | |
199 | |
200 | |
201 =head2 remove_database | |
202 | |
203 Arg [1] : string $name | |
204 Example : $db = $seq_dumper->remove_database('estgene'); | |
205 Description: Removes a database that has been attached to the seqdumper | |
206 via the attach database call. The database that is removed | |
207 is returned (or undef if it did not exist). | |
208 Returntype : Bio::EnsEMBL::DBSQL::DBAdaptor | |
209 Exceptions : thrown if incorrect argument is supplied | |
210 Caller : general | |
211 | |
212 =cut | |
213 | |
214 sub remove_database { | |
215 my ($self, $name) = @_; | |
216 | |
217 $name || throw("name arg is required"); | |
218 | |
219 if(exists $self->{'attached_dbs'}->{$name}) { | |
220 return delete $self->{'attached_dbs'}->{$name}; | |
221 } | |
222 | |
223 return undef; | |
224 } | |
225 | |
226 | |
227 =head2 disable_feature_type | |
228 | |
229 Arg [1] : string $type | |
230 Example : $seq_dumper->disable_feature_type('genes'); | |
231 Description: Disables the dumping of a specific type of feature | |
232 Returntype : none | |
233 Exceptions : warn if an invalid feature type is passed, | |
234 thrown if no feature type is passed | |
235 Caller : general | |
236 | |
237 =cut | |
238 | |
239 sub disable_feature_type { | |
240 my ($self, $type) = @_; | |
241 | |
242 $type || throw("type arg is required"); | |
243 | |
244 if(exists($self->{'feature_types'}->{$type})) { | |
245 $self->{'feature_types'}->{$type} = 0; | |
246 } else { | |
247 warning("unknown feature type '$type'\n" . | |
248 "valid types are: " . join(',', keys %{$self->{'feature_types'}})); | |
249 } | |
250 } | |
251 | |
252 | |
253 | |
254 =head2 is_enabled | |
255 | |
256 Arg [1] : string $type | |
257 Example : do_something() if($seq_dumper->is_enabled('gene')); | |
258 Description: checks if a specific feature type is enabled | |
259 Returntype : none | |
260 Exceptions : warning if invalid type is passed, | |
261 thrown if no type is passed | |
262 Caller : general | |
263 | |
264 =cut | |
265 | |
266 sub is_enabled { | |
267 my ($self, $type) = @_; | |
268 | |
269 $type || throw("type arg is required"); | |
270 | |
271 if(exists($self->{'feature_types'}->{$type})) { | |
272 return $self->{'feature_types'}->{$type}; | |
273 } else { | |
274 warning("unknown feature type '$type'\n" . | |
275 "valid types are: " . join(',', keys %{$self->{'feature_types'}})); | |
276 } | |
277 } | |
278 | |
279 | |
280 =head2 dump | |
281 | |
282 Arg [1] : Bio::EnsEMBL::Slice slice | |
283 The slice to dump | |
284 Arg [2] : string $format | |
285 The name of the format to dump | |
286 Arg [3] : (optional) $outfile | |
287 The name of the file to dump to. If no file is specified STDOUT | |
288 is used | |
289 Arg [4] : (optional) $seq | |
290 Sequence to dump | |
291 Arg [4] : (optional) $no_append | |
292 Default action is to open the file in append mode. This will | |
293 turn that mode off | |
294 Example : $seq_dumper->dump($slice, 'EMBL'); | |
295 Description: Dumps a region of a genome specified by the slice argument into | |
296 an outfile of the format $format | |
297 Returntype : none | |
298 Exceptions : thrown if slice or format args are not supplied | |
299 Caller : general | |
300 | |
301 =cut | |
302 | |
303 | |
304 sub dump { | |
305 my ($self, $slice, $format, $outfile, $seq, $no_append) = @_; | |
306 | |
307 $format || throw("format arg is required"); | |
308 $slice || throw("slice arg is required"); | |
309 | |
310 my $dump_handler = $DUMP_HANDLERS->{uc($format)}; | |
311 | |
312 unless($dump_handler) { | |
313 throw("No dump handler is defined for format $format\n"); | |
314 } | |
315 | |
316 | |
317 my $FH = IO::File->new;; | |
318 if($outfile) { | |
319 my $mode = ($no_append) ? '>' : '>>'; | |
320 $FH->open("${mode}${outfile}") or throw("Could not open file $outfile: $!"); | |
321 } else { | |
322 $FH = \*STDOUT; | |
323 #mod_perl did not like the following | |
324 #$FH->fdopen(fileno(STDOUT), "w") | |
325 # or throw("Could not open currently selected output filehandle " . | |
326 # "for writing"); | |
327 } | |
328 | |
329 &$dump_handler($self, $slice, $FH, $seq); | |
330 | |
331 $FH->close if ($outfile); #close if we were writing to a file | |
332 } | |
333 | |
334 =head2 dump_embl | |
335 | |
336 Arg [1] : Bio::EnsEMBL::Slice | |
337 Arg [2] : IO::File $FH | |
338 Arg [3] : optional sequence string | |
339 Example : $seq_dumper->dump_embl($slice, $FH); | |
340 Description: Dumps an EMBL flat file to an open file handle | |
341 Returntype : none | |
342 Exceptions : none | |
343 Caller : dump | |
344 | |
345 =cut | |
346 | |
347 sub dump_embl { | |
348 my $self = shift; | |
349 my $slice = shift; | |
350 my $FH = shift; | |
351 my $SEQ = shift; | |
352 | |
353 my $len = $slice->length; | |
354 | |
355 my $version; | |
356 my $acc; | |
357 | |
358 my $cs = $slice->coord_system(); | |
359 my $name_str = $cs->name() . ' ' . $slice->seq_region_name(); | |
360 $name_str .= ' ' . $cs->version if($cs->version); | |
361 | |
362 my $start = $slice->start; | |
363 my $end = $slice->end; | |
364 | |
365 #determine if this slice is the entire seq region | |
366 #if it is then we just use the name as the id | |
367 my $slice_adaptor = $slice->adaptor(); | |
368 my $full_slice = | |
369 $slice->adaptor->fetch_by_region($cs->name, | |
370 $slice->seq_region_name, | |
371 undef,undef,undef, | |
372 $cs->version); | |
373 | |
374 | |
375 my $entry_name = $slice->seq_region_name(); | |
376 | |
377 | |
378 | |
379 if($full_slice->name eq $slice->name) { | |
380 $name_str .= ' full sequence'; | |
381 $acc = $slice->seq_region_name(); | |
382 my @acc_ver = split(/\./, $acc); | |
383 if(@acc_ver == 2) { | |
384 $acc = $acc_ver[0]; | |
385 $version = $acc_ver[0] . '.'. $acc_ver[1]; | |
386 } elsif(@acc_ver == 1 && $cs->version()) { | |
387 $version = $acc . '.'. $cs->version(); | |
388 } else { | |
389 $version = $acc; | |
390 } | |
391 } else { | |
392 $name_str .= ' partial sequence'; | |
393 $acc = $slice->name(); | |
394 $version = $acc; | |
395 } | |
396 | |
397 $acc = $slice->name(); | |
398 | |
399 | |
400 | |
401 #line breaks are allowed near the end of the line on ' ', "\t", "\n", ',' | |
402 $: = (" \t\n-,"); | |
403 | |
404 ############# | |
405 # dump header | |
406 ############# | |
407 | |
408 my $EMBL_HEADER = | |
409 '@< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~ | |
410 '; | |
411 | |
412 #ID and moltype | |
413 # HTG = High Throughput Genome division, probably most suitable | |
414 # and it would be hard to come up with another appropriate division | |
415 # that worked for all organisms (e.g. plants are in PLN but human is | |
416 # in HUM). | |
417 my $VALUE = "$entry_name standard; DNA; HTG; $len BP."; | |
418 $self->write($FH, $EMBL_HEADER, 'ID', $VALUE); | |
419 $self->print( $FH, "XX\n" ); | |
420 | |
421 #Accession | |
422 $self->write($FH, $EMBL_HEADER, 'AC', $acc); | |
423 $self->print( $FH, "XX\n" ); | |
424 | |
425 #Version | |
426 $self->write($FH, $EMBL_HEADER, 'SV', $version); | |
427 $self->print( $FH, "XX\n" ); | |
428 | |
429 #Date | |
430 $self->write($FH, $EMBL_HEADER, 'DT', $self->_date_string); | |
431 $self->print( $FH, "XX\n" ); | |
432 | |
433 my $meta_container = $slice->adaptor()->db()->get_MetaContainer(); | |
434 | |
435 #Description | |
436 $self->write($FH, $EMBL_HEADER, 'DE', $meta_container->get_scientific_name() . | |
437 " $name_str $start..$end annotated by Ensembl"); | |
438 $self->print( $FH, "XX\n" ); | |
439 | |
440 #key words | |
441 $self->write($FH, $EMBL_HEADER, 'KW', '.'); | |
442 $self->print( $FH, "XX\n" ); | |
443 | |
444 #Species | |
445 my $species_name = $meta_container->get_scientific_name(); | |
446 if(my $cn = $meta_container->get_common_name()) { | |
447 $species_name .= " ($cn)"; | |
448 } | |
449 | |
450 $self->write($FH, $EMBL_HEADER, 'OS', $species_name); | |
451 | |
452 #Classification | |
453 my $cls = $meta_container->get_classification(); | |
454 $self->write($FH, $EMBL_HEADER, 'OC', join('; ', reverse(@{$cls})) . '.'); | |
455 $self->print( $FH, "XX\n" ); | |
456 | |
457 #References (we are not dumping refereneces) | |
458 | |
459 #Database References (we are not dumping these) | |
460 | |
461 #comments | |
462 foreach my $comment (@COMMENTS) { | |
463 $self->write($FH, $EMBL_HEADER, 'CC', $comment); | |
464 $self->print( $FH, "XX\n" ); | |
465 } | |
466 | |
467 #################### | |
468 #DUMP FEATURE TABLE | |
469 #################### | |
470 $self->print( $FH, "FH Key Location/Qualifiers\n" ); | |
471 | |
472 my $FEATURE_TABLE = | |
473 'FT ^<<<<<<<<<<<<<<<^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~ | |
474 '; | |
475 $self->_dump_feature_table($slice, $FH, $FEATURE_TABLE); | |
476 | |
477 #write an XX after the feature tables | |
478 $self->print( $FH, "XX\n" ); | |
479 | |
480 ################### | |
481 #DUMP SEQUENCE | |
482 ################### | |
483 | |
484 if(!defined($SEQ)){ | |
485 $SEQ = $slice->seq(); | |
486 } | |
487 # my $SEQ = $slice->seq(); | |
488 my $length = length($SEQ); | |
489 my $a_count = $SEQ =~ tr/aA/aA/; | |
490 my $c_count = $SEQ =~ tr/cC/cC/; | |
491 my $t_count = $SEQ =~ tr/tT/tT/; | |
492 my $g_count = $SEQ =~ tr/gG/gG/; | |
493 my $other_count = $length - $a_count - $c_count - $t_count - $g_count; | |
494 | |
495 my $value = "Sequence $length BP; $a_count A; $c_count C; " . | |
496 "$g_count G; $t_count T; $other_count other;"; | |
497 $self->print($FH, 'SQ '.$value."\n"); | |
498 | |
499 $self->write_embl_seq($FH, \$SEQ); | |
500 | |
501 | |
502 $self->print( $FH, "//\n" ); | |
503 | |
504 # Set formatting back to normal | |
505 $: = " \n-"; | |
506 } | |
507 | |
508 | |
509 | |
510 | |
511 =head2 dump_genbank | |
512 | |
513 Arg [1] : Bio::EnsEMBL::Slice | |
514 Arg [2] : IO::File $FH | |
515 Example : $seq_dumper->dump_genbank($slice, $FH); | |
516 Description: Dumps a GENBANK flat file to an open file handle | |
517 Returntype : none | |
518 Exceptions : none | |
519 Caller : dump | |
520 | |
521 =cut | |
522 | |
523 sub dump_genbank { | |
524 my ($self, $slice, $FH, $SEQ) = @_; | |
525 | |
526 #line breaks are allowed near the end of the line on ' ', "\t", "\n", ',' | |
527 $: = " \t\n-,"; | |
528 | |
529 my $GENBANK_HEADER = | |
530 '^<<<<<<<<< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~ | |
531 '; | |
532 | |
533 my $GENBANK_SUBHEADER = | |
534 ' ^<<<<<<< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~ | |
535 '; | |
536 | |
537 my $GENBANK_FT = | |
538 ' ^<<<<<<<<<<<<<< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~ | |
539 '; | |
540 | |
541 my $version; | |
542 my $acc; | |
543 | |
544 my $cs = $slice->coord_system(); | |
545 | |
546 my $name_str = $cs->name() . ' ' . $slice->seq_region_name(); | |
547 | |
548 $name_str .= ' ' . $cs->version if($cs->version); | |
549 | |
550 #determine if this slice is the entire seq region | |
551 #if it is then we just use the name as the id | |
552 my $slice_adaptor = $slice->adaptor(); | |
553 my $full_slice = | |
554 $slice->adaptor->fetch_by_region($cs->name, | |
555 $slice->seq_region_name, | |
556 undef,undef,undef, | |
557 $cs->version); | |
558 | |
559 | |
560 my $entry_name = $slice->seq_region_name(); | |
561 | |
562 if($full_slice->name eq $slice->name) { | |
563 $name_str .= ' full sequence'; | |
564 $acc = $slice->seq_region_name(); | |
565 my @acc_ver = split(/\./, $acc); | |
566 if(@acc_ver == 2) { | |
567 $acc = $acc_ver[0]; | |
568 $version = $acc_ver[0] . $acc_ver[1]; | |
569 } elsif(@acc_ver == 1 && $cs->version()) { | |
570 $version = $acc . $cs->version(); | |
571 } else { | |
572 $version = $acc; | |
573 } | |
574 } else { | |
575 $name_str .= ' partial sequence'; | |
576 $acc = $slice->name(); | |
577 $version = $acc; | |
578 } | |
579 | |
580 $acc = $slice->name(); # to keep format consistent for all | |
581 | |
582 my $length = $slice->length; | |
583 my $start = $slice->start(); | |
584 my $end = $slice->end(); | |
585 | |
586 my $date = $self->_date_string; | |
587 | |
588 my $meta_container = $slice->adaptor()->db()->get_MetaContainer(); | |
589 | |
590 #LOCUS | |
591 my $tag = 'LOCUS'; | |
592 my $value = "$entry_name $length bp DNA HTG $date"; | |
593 $self->write($FH, $GENBANK_HEADER, $tag, $value); | |
594 | |
595 #DEFINITION | |
596 $tag = "DEFINITION"; | |
597 $value = $meta_container->get_scientific_name() . | |
598 " $name_str $start..$end reannotated via EnsEMBL"; | |
599 $self->write($FH, $GENBANK_HEADER, $tag, $value); | |
600 | |
601 #ACCESSION | |
602 $self->write($FH, $GENBANK_HEADER, 'ACCESSION', $acc); | |
603 | |
604 #VERSION | |
605 $self->write($FH, $GENBANK_HEADER, 'VERSION', $version); | |
606 | |
607 # KEYWORDS | |
608 $self->write($FH, $GENBANK_HEADER, 'KEYWORDS', '.'); | |
609 | |
610 # SOURCE | |
611 my $common_name = $meta_container->get_common_name(); | |
612 $common_name = $meta_container->get_scientific_name() unless $common_name; | |
613 $self->write($FH, $GENBANK_HEADER, 'SOURCE', $common_name); | |
614 | |
615 #organism | |
616 my @cls = $meta_container->get_classification(); | |
617 shift @cls; | |
618 $self->write($FH, $GENBANK_SUBHEADER, 'ORGANISM', $meta_container->get_scientific_name()); | |
619 $self->write($FH, $GENBANK_SUBHEADER, '', join('; ', reverse @cls) . "."); | |
620 | |
621 #refereneces | |
622 | |
623 #comments | |
624 foreach my $comment (@COMMENTS) { | |
625 $self->write($FH, $GENBANK_HEADER, 'COMMENT', $comment); | |
626 } | |
627 | |
628 #################### | |
629 # DUMP FEATURE TABLE | |
630 #################### | |
631 $self->print( $FH, "FEATURES Location/Qualifiers\n" ); | |
632 $self->_dump_feature_table($slice, $FH, $GENBANK_FT); | |
633 | |
634 #################### | |
635 # DUMP SEQUENCE | |
636 #################### | |
637 | |
638 if(!defined($SEQ)){ | |
639 $SEQ = $slice->seq(); | |
640 } | |
641 # my $SEQ = $slice->seq(); | |
642 my $a_count = $SEQ =~ tr/aA/aA/; | |
643 my $c_count = $SEQ =~ tr/cC/cC/; | |
644 my $t_count = $SEQ =~ tr/tT/tT/; | |
645 my $g_count = $SEQ =~ tr/gG/gG/; | |
646 my $bp_length = length($SEQ); | |
647 my $other_count = $bp_length - $a_count - $c_count - $t_count - $g_count; | |
648 | |
649 $tag = 'BASE COUNT'; | |
650 $value = "$a_count a $c_count c $g_count g $t_count t"; | |
651 $value .= " $other_count n" if($other_count); | |
652 $self->print($FH, qq{$tag $value\n}); | |
653 $self->print( $FH, "ORIGIN\n" ); | |
654 | |
655 $self->write_genbank_seq($FH, \$SEQ); | |
656 | |
657 $self->print( $FH, "//\n" ); | |
658 | |
659 # Set formatting back to normal | |
660 $: = " \n-"; | |
661 } | |
662 | |
663 | |
664 | |
665 =head2 _dump_feature_table | |
666 | |
667 Arg [1] : Bio::EnsEMBL::Slice slice | |
668 Example : none | |
669 Description: Helper method used to dump feature tables used in EMBL, FASTA, | |
670 GENBANK. Assumes formating of file handle has been setup | |
671 already to use $FEAT and $VALUE values. | |
672 Returntype : none | |
673 Exceptions : none | |
674 Caller : internal | |
675 | |
676 =cut | |
677 | |
678 sub _dump_feature_table { | |
679 my $self = shift; | |
680 my $slice = shift; | |
681 my $FH = shift; | |
682 my $FORMAT = shift; | |
683 | |
684 #use only the core database to dump features (except for bloody snps) | |
685 my $lite = $slice->adaptor->db->remove_db_adaptor('lite'); | |
686 | |
687 my $meta = $slice->adaptor->db->get_MetaContainer; | |
688 | |
689 #lump file handle and format string together for simpler method calls | |
690 my @ff = ($FH, $FORMAT); | |
691 my $value; | |
692 | |
693 #source | |
694 my $classification = join(', ', $meta->get_classification()); | |
695 $self->write(@ff,'source', "1.." . $slice->length()); | |
696 $self->write(@ff,'' , '/organism="'.$meta->get_scientific_name(). '"'); | |
697 $self->write(@ff,'' , '/db_xref="taxon:'.$meta->get_taxonomy_id().'"'); | |
698 | |
699 # | |
700 # Transcripts & Genes | |
701 # | |
702 my @gene_slices; | |
703 if($self->is_enabled('gene')) { | |
704 push @gene_slices, $slice; | |
705 } | |
706 | |
707 # Retrieve slices of other database where we need to pull genes from | |
708 | |
709 my $gene_dbs = {'vegagene' => 'vega', | |
710 'estgene' => 'estgene'}; | |
711 | |
712 foreach my $gene_type (keys %$gene_dbs) { | |
713 if($self->is_enabled($gene_type)) { | |
714 my $db = $self->get_database($gene_dbs->{$gene_type}); | |
715 if($db) { | |
716 my $sa = $db->get_SliceAdaptor(); | |
717 push @gene_slices, $sa->fetch_by_name($slice->name()); | |
718 } else { | |
719 warning("A [". $gene_dbs->{$gene_type} ."] database must be " . | |
720 "attached to this SeqDumper\n(via a call to " . | |
721 "attach_database) to retrieve genes of type [$gene_type]"); | |
722 } | |
723 } | |
724 } | |
725 | |
726 foreach my $gene_slice (@gene_slices) { | |
727 my @genes = @{$gene_slice->get_all_Genes(undef,undef, 1)}; | |
728 while(my $gene = shift @genes) { | |
729 $value = $self->features2location( [$gene] ); | |
730 $self->write( @ff, 'gene', $value ); | |
731 $self->write( @ff, "", '/gene='.$gene->stable_id() ); | |
732 | |
733 | |
734 if(defined($gene->display_xref)){ | |
735 $self->write( @ff, "",'/locus_tag="'.$gene->display_xref->display_id.'"'); | |
736 } | |
737 my $desc = $gene->description; | |
738 if(defined($desc) and $desc ne ""){ | |
739 $desc =~ s/\"//; | |
740 $self->write( @ff, "", '/note="'.$gene->description.'"'); | |
741 } | |
742 | |
743 | |
744 | |
745 foreach my $transcript (@{$gene->get_all_Transcripts}) { | |
746 my $translation = $transcript->translation; | |
747 | |
748 # normal transcripts get dumped differently than pseudogenes | |
749 if($translation) { | |
750 #normal transcript | |
751 $value = $self->features2location($transcript->get_all_Exons); | |
752 $self->write(@ff, 'mRNA', $value); | |
753 $self->write(@ff,'' , '/gene="'.$gene->stable_id().'"'); | |
754 $self->write(@ff,'' | |
755 ,'/note="transcript_id='.$transcript->stable_id().'"'); | |
756 | |
757 # ...and a CDS section | |
758 $value = | |
759 $self->features2location($transcript->get_all_translateable_Exons); | |
760 $self->write(@ff,'CDS', $value); | |
761 my $codon_start = $self->transcript_to_codon_start($transcript); | |
762 $self->write(@ff,'' , qq{/codon_start="${codon_start}"}) if $codon_start > 1; | |
763 $self->write(@ff,'' , '/gene="'.$gene->stable_id().'"'); | |
764 $self->write(@ff,'' , '/protein_id="'.$translation->stable_id().'"'); | |
765 $self->write(@ff,'' ,'/note="transcript_id='.$transcript->stable_id().'"'); | |
766 | |
767 foreach my $dbl (@{$transcript->get_all_DBLinks}) { | |
768 $value = '/db_xref="'.$dbl->dbname().':'.$dbl->display_id().'"'; | |
769 $self->write(@ff, '', $value); | |
770 } | |
771 | |
772 $value = '/translation="'.$transcript->translate()->seq().'"'; | |
773 $self->write(@ff, '', $value); | |
774 } else { | |
775 #pseudogene | |
776 $value = $self->features2location($transcript->get_all_Exons); | |
777 $self->write(@ff, 'misc_RNA', $value); | |
778 $self->write(@ff,'' , '/gene="'.$gene->stable_id().'"'); | |
779 foreach my $dbl (@{$transcript->get_all_DBLinks}) { | |
780 $value = '/db_xref="'.$dbl->dbname().':'.$dbl->primary_id().'"'; | |
781 $self->write(@ff, '', $value); | |
782 } | |
783 $self->write(@ff,'' , '/note="'.$transcript->biotype().'"'); | |
784 $self->write(@ff,'' | |
785 ,'/note="transcript_id='.$transcript->stable_id().'"'); | |
786 } | |
787 } | |
788 } | |
789 | |
790 # exons | |
791 foreach my $gene (@{$gene_slice->get_all_Genes(undef,undef,1)}) { | |
792 foreach my $exon (@{$gene->get_all_Exons}) { | |
793 $self->write(@ff,'exon', $self->features2location([$exon])); | |
794 $self->write(@ff,'' , '/note="exon_id='.$exon->stable_id().'"'); | |
795 } | |
796 } | |
797 } | |
798 | |
799 # | |
800 # genscans | |
801 # | |
802 if($self->is_enabled('genscan')) { | |
803 my @genscan_exons; | |
804 my @transcripts = @{$slice->get_all_PredictionTranscripts(undef,1)}; | |
805 while(my $transcript = shift @transcripts) { | |
806 my $exons = $transcript->get_all_Exons(); | |
807 push @genscan_exons, @$exons; | |
808 $self->write(@ff, 'mRNA', $self->features2location($exons)); | |
809 $self->write(@ff, '', '/product="'.$transcript->translate()->seq().'"'); | |
810 $self->write(@ff, '', '/note="identifier='.$transcript->stable_id.'"'); | |
811 $self->write(@ff, '', '/note="Derived by automated computational' . | |
812 ' analysis using gene prediction method:' . | |
813 $transcript->analysis->logic_name . '"'); | |
814 } | |
815 } | |
816 | |
817 # | |
818 # snps | |
819 # | |
820 if($self->is_enabled('variation') && $slice->can('get_all_VariationFeatures')) { | |
821 # $slice->adaptor->db->add_db_adaptor('lite', $lite) if $lite; | |
822 | |
823 my @variations = @{$slice->get_all_VariationFeatures()}; | |
824 while(my $snp = shift @variations) { | |
825 my $ss = $snp->start; | |
826 my $se = $snp->end; | |
827 #skip snps that hang off edge of slice | |
828 next if($ss < 1 || $se > $slice->length); | |
829 | |
830 $self->write(@ff, 'variation', "$ss..$se"); | |
831 $self->write(@ff, '' , '/replace="'.$snp->allele_string.'"'); | |
832 #$self->write(@ff, '' , '/evidence="'.$snp->status.'"'); | |
833 my $rs_id = $snp->variation_name(); | |
834 my $db = $snp->source(); | |
835 # foreach my $link ($snp->each_DBLink) { | |
836 # my $id = $link->primary_id; | |
837 # my $db = $link->database; | |
838 $self->write(@ff, '', "/db_xref=\"$db:$rs_id\""); | |
839 # } | |
840 } | |
841 | |
842 # $slice->adaptor->db->remove_db_adaptor('lite') if $lite; | |
843 } | |
844 | |
845 # | |
846 # similarity features | |
847 # | |
848 if($self->is_enabled('similarity')) { | |
849 foreach my $sim (@{$slice->get_all_SimilarityFeatures}) { | |
850 $self->write(@ff, 'misc_feature', $self->features2location([$sim])); | |
851 $self->write(@ff, '' , '/note="match: '.$sim->hseqname. | |
852 ' : '.$sim->hstart.'..'.$sim->hend.'('.$sim->hstrand.')"'); | |
853 } | |
854 } | |
855 | |
856 # | |
857 # repeats | |
858 # | |
859 if($self->is_enabled('repeat')) { | |
860 my @rfs = @{$slice->get_all_RepeatFeatures()}; | |
861 | |
862 while(my $repeat = shift @rfs) { | |
863 $self->write(@ff, 'repeat_region', $self->features2location([$repeat])); | |
864 $self->write(@ff, '' , '/note="' . $repeat->repeat_consensus->name. | |
865 ' repeat: matches ' . $repeat->hstart.'..'.$repeat->hend . | |
866 '('.$repeat->hstrand.') of consensus"'); | |
867 } | |
868 | |
869 } | |
870 | |
871 # | |
872 # markers | |
873 # | |
874 if($self->is_enabled('marker') && $slice->can('get_all_MarkerFeatures')) { | |
875 my @markers = @{$slice->get_all_MarkerFeatures()}; | |
876 while(my $mf = shift @markers) { | |
877 $self->write(@ff, 'STS', $self->features2location([$mf])); | |
878 if($mf->marker->display_MarkerSynonym) { | |
879 $self->write(@ff, '' , '/standard_name="' . | |
880 $mf->marker->display_MarkerSynonym->name . '"'); | |
881 } | |
882 | |
883 | |
884 #grep out synonyms without a source | |
885 my @synonyms = @{$mf->marker->get_all_MarkerSynonyms}; | |
886 @synonyms = grep {$_->source } @synonyms; | |
887 foreach my $synonym (@synonyms) { | |
888 $self->write(@ff, '', '/db_xref="'.$synonym->source. | |
889 ':'.$synonym->name.'"'); | |
890 } | |
891 $self->write(@ff, '', '/note="map_weight='.$mf->map_weight.'"'); | |
892 } | |
893 } | |
894 | |
895 # | |
896 # contigs | |
897 # | |
898 if($self->is_enabled('contig')) { | |
899 foreach my $segment (@{$slice->project('seqlevel')}) { | |
900 my ($start, $end, $slice) = @$segment; | |
901 $self->write(@ff, 'misc_feature', | |
902 $start .'..'. $end); | |
903 $self->write(@ff, '', '/note="contig '.$slice->seq_region_name . | |
904 ' ' . $slice->start . '..' . $slice->end . | |
905 '(' . $slice->strand . ')"'); | |
906 } | |
907 } | |
908 | |
909 $slice->adaptor->db->add_db_adaptor('lite', $lite) if $lite; | |
910 | |
911 } | |
912 | |
913 # /codon_start= is the first base to start translating from. This maps | |
914 # Ensembl start exon phase to this concept. Here we present the usage | |
915 # of phase in this concept where each row shows the sequence the | |
916 # spliced_seq() method will return | |
917 | |
918 # 123456789 | |
919 # ATTATGACG | |
920 # Phase == 0 ...+++### codon_start=0 // start from 1st A | |
921 # Phase == 1 ..+++### codon_start=3 // start from 2nd A (base 3 in the given spliced sequence) | |
922 # Phase == 2 .+++### codon_start=2 // start from 2nd A (base 2 in the spliced sequence) | |
923 # | |
924 # In the case of the final 2 phases we will generate a X codon | |
925 # | |
926 | |
927 sub transcript_to_codon_start { | |
928 my ($self, $transcript) = @_; | |
929 my $start_phase = $transcript->start_Exon()->phase(); | |
930 return ( $start_phase == 1 ) ? 3 : | |
931 ( $start_phase == 2 ) ? 2 : | |
932 ( $start_phase == 0 ) ? 1 : | |
933 -1; | |
934 } | |
935 | |
936 | |
937 =head2 dump_fasta | |
938 | |
939 Arg [1] : Bio::EnsEMBL::Slice | |
940 Arg [2] : IO::File $FH | |
941 Example : $seq_dumper->dump_fasta($slice, $FH); | |
942 Description: Dumps an FASTA flat file to an open file handle | |
943 Returntype : none | |
944 Exceptions : none | |
945 Caller : dump | |
946 | |
947 =cut | |
948 | |
949 sub dump_fasta { | |
950 my $self = shift; | |
951 my $slice = shift; | |
952 my $FH = shift; | |
953 | |
954 my $id = $slice->seq_region_name; | |
955 my $seqtype = 'dna'; | |
956 my $idtype = $slice->coord_system->name; | |
957 my $location = $slice->name; | |
958 my $start = 1; | |
959 my $end = $slice->length(); | |
960 | |
961 my $header = ">$id $seqtype:$idtype $location\n"; | |
962 $self->print( $FH, $header ); | |
963 | |
964 #set the formatting to FASTA | |
965 my $FORMAT = '^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< | |
966 '; | |
967 | |
968 #chunk the sequence in 60kb chunks to use less memory | |
969 my $cur = $start; | |
970 while($cur <= $end) { | |
971 my $to = $cur + 59_999; | |
972 $to = $end if($to > $end); | |
973 my $seq = $slice->subseq($cur, $to); | |
974 $cur = $to + 1; | |
975 $self->write($FH, $FORMAT, $seq); | |
976 } | |
977 } | |
978 | |
979 | |
980 | |
981 =head2 features2location | |
982 | |
983 Arg [1] : listref of Bio::EnsEMBL::SeqFeatures | |
984 Example : $location = $self->features2location(\@features); | |
985 Description: Constructs an EMBL location string from a list of features | |
986 Returntype : string | |
987 Exceptions : none | |
988 Caller : internal | |
989 | |
990 =cut | |
991 | |
992 sub features2location { | |
993 my $self = shift; | |
994 my $features = shift; | |
995 | |
996 my @join = (); | |
997 | |
998 foreach my $f (@$features) { | |
999 my $slice = $f->slice; | |
1000 my $start = $f->start(); | |
1001 my $end = $f->end(); | |
1002 my $strand = $f->strand(); | |
1003 | |
1004 if($start >= 1 && $end <= $slice->length) { | |
1005 #this feature in on a slice and doesn't lie outside the boundary | |
1006 | |
1007 if($strand == 1) { | |
1008 push @join, "$start..$end"; | |
1009 } else { | |
1010 push @join, "complement($start..$end)"; | |
1011 } | |
1012 } else { | |
1013 my @fs = (); | |
1014 #this feature is outside the boundary of the dump, | |
1015 # yet implemented and 'seqlevel' is guaranteed to be 1step | |
1016 my $projection = $f->project('seqlevel'); | |
1017 foreach my $segment (@$projection) { | |
1018 my $slice = $segment->[2]; | |
1019 my $slc_start = $slice->start(); | |
1020 my $slc_end = $slice->end(); | |
1021 my $seq_reg = $slice->seq_region_name(); | |
1022 if($slice->strand == 1) { | |
1023 push @join, "$seq_reg:$slc_start..$slc_end"; | |
1024 } else { | |
1025 push @join, "complement($seq_reg:$slc_start..$slc_end)"; | |
1026 } | |
1027 } | |
1028 } | |
1029 } | |
1030 | |
1031 my $out = join ',', @join; | |
1032 | |
1033 if(scalar @join > 1) { | |
1034 $out = "join($out)"; | |
1035 } | |
1036 | |
1037 return $out; | |
1038 } | |
1039 | |
1040 | |
1041 sub _date_string { | |
1042 my $self = shift; | |
1043 | |
1044 my ($sec, $min, $hour, $mday,$mon, $year) = localtime(time()); | |
1045 | |
1046 my $month = ('JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', | |
1047 'AUG', 'SEP', 'OCT', 'NOV', 'DEC')[$mon]; | |
1048 $year += 1900; | |
1049 | |
1050 return "$mday-$month-$year"; | |
1051 } | |
1052 | |
1053 | |
1054 sub write { | |
1055 my ($self, $FH, $FORMAT, @values) = @_; | |
1056 | |
1057 #while the last value still contains something | |
1058 while(defined($values[-1]) and $values[-1] ne '') { | |
1059 formline($FORMAT, @values); | |
1060 $self->print( $FH, $^A ); | |
1061 $^A = ''; | |
1062 } | |
1063 } | |
1064 | |
1065 sub write_genbank_seq { | |
1066 my $self = shift; | |
1067 my $FH = shift; | |
1068 my $seq = shift; | |
1069 my $base_total = shift; | |
1070 | |
1071 $base_total ||= 0; | |
1072 | |
1073 my $GENBANK_SEQ = | |
1074 '@>>>>>>>> ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<<~ | |
1075 '; | |
1076 | |
1077 my $total = -59 + $base_total; | |
1078 #keep track of total and print lines of 60 bases with spaces every 10bp | |
1079 while($$seq) { | |
1080 $total += 60; | |
1081 formline($GENBANK_SEQ,$total, $$seq, $$seq, $$seq, $$seq, $$seq, $$seq); | |
1082 $self->print( $FH, $^A ); | |
1083 $^A = ''; | |
1084 } | |
1085 } | |
1086 | |
1087 sub write_embl_seq { | |
1088 my $self = shift; | |
1089 my $FH = shift; | |
1090 my $seq = shift; | |
1091 my $base_total = shift; | |
1092 | |
1093 $base_total ||= 0; | |
1094 | |
1095 my $EMBL_SEQ = | |
1096 ' ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<<@>>>>>>>>>~ | |
1097 '; | |
1098 #keep track of total and print lines of 60 bases with spaces every 10bp | |
1099 my $length = length($$seq); | |
1100 my $total = $length - $base_total; | |
1101 while($$seq) { | |
1102 $total -= 60; | |
1103 $total = 0 if($total < 0); | |
1104 formline($EMBL_SEQ, | |
1105 $$seq, $$seq, $$seq, $$seq, $$seq, $$seq, | |
1106 $length - $total); | |
1107 $self->print( $FH, $^A ); | |
1108 $^A = ''; | |
1109 } | |
1110 } | |
1111 | |
1112 sub print { | |
1113 my( $self, $FH, $string ) = @_; | |
1114 if(!print $FH $string){ | |
1115 print STDERR "Problem writing to disk\n"; | |
1116 print STDERR "the string is $string\n"; | |
1117 die "Could not write to file handle"; | |
1118 } | |
1119 } | |
1120 | |
1121 1; |