comparison variant_effect_predictor/Bio/SeqIO/embl.pm @ 0:2bc9b66ada89 draft default tip

Uploaded
author mahtabm
date Thu, 11 Apr 2013 06:29:17 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:2bc9b66ada89
1 # $Id: embl.pm,v 1.57.2.6 2003/09/14 19:06:51 jason Exp $
2 #
3 # BioPerl module for Bio::SeqIO::EMBL
4 #
5 # Cared for by Ewan Birney <birney@ebi.ac.uk>
6 #
7 # Copyright Ewan Birney
8 #
9 # You may distribute this module under the same terms as perl itself
10
11 # POD documentation - main docs before the code
12
13 =head1 NAME
14
15 Bio::SeqIO::embl - EMBL sequence input/output stream
16
17 =head1 SYNOPSIS
18
19 It is probably best not to use this object directly, but
20 rather go through the AnnSeqIO handler system. Go:
21
22 $stream = Bio::SeqIO->new(-file => $filename, -format => 'EMBL');
23
24 while ( (my $seq = $stream->next_seq()) ) {
25 # do something with $seq
26 }
27
28 =head1 DESCRIPTION
29
30 This object can transform Bio::Seq objects to and from EMBL flat
31 file databases.
32
33 There is alot of flexibility here about how to dump things which I need
34 to document fully.
35
36 There should be a common object that this and genbank share (probably
37 with swissprot). Too much of the magic is identical.
38
39 =head2 Optional functions
40
41 =over 3
42
43 =item _show_dna()
44
45 (output only) shows the dna or not
46
47 =item _post_sort()
48
49 (output only) provides a sorting func which is applied to the FTHelpers
50 before printing
51
52 =item _id_generation_func()
53
54 This is function which is called as
55
56 print "ID ", $func($annseq), "\n";
57
58 To generate the ID line. If it is not there, it generates a sensible ID
59 line using a number of tools.
60
61 If you want to output annotations in embl format they need to be
62 stored in a Bio::Annotation::Collection object which is accessible
63 through the Bio::SeqI interface method L<annotation()|annotation>.
64
65 The following are the names of the keys which are polled from a
66 L<Bio::Annotation::Collection> object.
67
68 reference - Should contain Bio::Annotation::Reference objects
69 comment - Should contain Bio::Annotation::Comment objects
70 dblink - Should contain Bio::Annotation::DBLink objects
71
72 =back
73
74 =head1 FEEDBACK
75
76 =head2 Mailing Lists
77
78 User feedback is an integral part of the evolution of this
79 and other Bioperl modules. Send your comments and suggestions preferably
80 to one of the Bioperl mailing lists.
81 Your participation is much appreciated.
82
83 bioperl-l@bioperl.org - General discussion
84 http://www.bioperl.org/MailList.shtml - About the mailing lists
85
86 =head2 Reporting Bugs
87
88 Report bugs to the Bioperl bug tracking system to help us keep track
89 the bugs and their resolution.
90 Bug reports can be submitted via email or the web:
91
92 bioperl-bugs@bio.perl.org
93 http://bugzilla.bioperl.org/
94
95 =head1 AUTHOR - Ewan Birney
96
97 Email birney@ebi.ac.uk
98
99 Describe contact details here
100
101 =head1 APPENDIX
102
103 The rest of the documentation details each of the object
104 methods. Internal methods are usually preceded with a _
105
106 =cut
107
108
109 # Let the code begin...
110
111
112 package Bio::SeqIO::embl;
113 use vars qw(@ISA);
114 use strict;
115 use Bio::SeqIO::FTHelper;
116 use Bio::SeqFeature::Generic;
117 use Bio::Species;
118 use Bio::Seq::SeqFactory;
119 use Bio::Annotation::Collection;
120 use Bio::Annotation::Comment;
121 use Bio::Annotation::Reference;
122 use Bio::Annotation::DBLink;
123
124 @ISA = qw(Bio::SeqIO);
125
126 sub _initialize {
127 my($self,@args) = @_;
128
129 $self->SUPER::_initialize(@args);
130 # hash for functions for decoding keys.
131 $self->{'_func_ftunit_hash'} = {};
132 $self->_show_dna(1); # sets this to one by default. People can change it
133 if( ! defined $self->sequence_factory ) {
134 $self->sequence_factory(new Bio::Seq::SeqFactory
135 (-verbose => $self->verbose(),
136 -type => 'Bio::Seq::RichSeq'));
137 }
138 }
139
140 =head2 next_seq
141
142 Title : next_seq
143 Usage : $seq = $stream->next_seq()
144 Function: returns the next sequence in the stream
145 Returns : Bio::Seq object
146 Args :
147
148 =cut
149
150 sub next_seq {
151 my ($self,@args) = @_;
152 my ($pseq,$c,$line,$name,$desc,$acc,$seqc,$mol,$div,
153 $date, $comment, @date_arr);
154
155 my ($annotation, %params, @features) = ( new Bio::Annotation::Collection);
156
157 $line = $self->_readline; # This needs to be before the first eof() test
158
159 if( !defined $line ) {
160 return undef; # no throws - end of file
161 }
162
163 if( $line =~ /^\s+$/ ) {
164 while( defined ($line = $self->_readline) ) {
165 $line =~/^\S/ && last;
166 }
167 }
168 if( !defined $line ) {
169 return undef; # end of file
170 }
171 $line =~ /^ID\s+\S+/ || $self->throw("EMBL stream with no ID. Not embl in my book");
172 $line =~ /^ID\s+(\S+)\s+\S+\;\s+([^;]+)\;\s+(\S+)\;/;
173 $name = $1;
174 $mol = $2;
175 $div = $3;
176 if(! $name) {
177 $name = "unknown id";
178 }
179 my $alphabet;
180
181 # this is important to have the id for display in e.g. FTHelper, otherwise
182 # you won't know which entry caused an error
183 if($mol) {
184 if ( $mol =~ /circular/ ) {
185 $params{'-is_circular'} = 1;
186 $mol =~ s|circular ||;
187 }
188 if (defined $mol ) {
189 if ($mol =~ /DNA/) {
190 $alphabet='dna';
191 }
192 elsif ($mol =~ /RNA/) {
193 $alphabet='rna';
194 }
195 elsif ($mol =~ /AA/) {
196 $alphabet='protein';
197 }
198 }
199
200 }
201
202 # $self->warn("not parsing upper annotation in EMBL file yet!");
203 my $buffer = $line;
204
205 local $_;
206
207 BEFORE_FEATURE_TABLE :
208 until( !defined $buffer ) {
209 $_ = $buffer;
210
211 # Exit at start of Feature table
212 last if /^F[HT]/;
213
214 # Description line(s)
215 if (/^DE\s+(\S.*\S)/) {
216 $desc .= $desc ? " $1" : $1;
217 }
218
219 #accession number
220 if( /^AC\s+(.*)?/ ) {
221 my @accs = split(/[; ]+/, $1); # allow space in addition
222 $params{'-accession_number'} = shift @accs
223 unless defined $params{'-accession_number'};
224 push @{$params{'-secondary_accessions'}}, @accs;
225 }
226
227 #version number
228 if( /^SV\s+\S+\.(\d+);?/ ) {
229 my $sv = $1;
230 #$sv =~ s/\;//;
231 $params{'-seq_version'} = $sv;
232 $params{'-version'} = $sv;
233 }
234
235 #date (NOTE: takes last date line)
236 if( /^DT\s+(.+)$/ ) {
237 my $date = $1;
238 push @{$params{'-dates'}}, $date;
239 }
240
241 #keywords
242 if( /^KW\s+(.*)\S*$/ ) {
243 my @kw = split(/\s*\;\s*/,$1);
244 push @{$params{'-keywords'}}, @kw;
245 }
246
247 # Organism name and phylogenetic information
248 elsif (/^O[SC]/) {
249 my $species = $self->_read_EMBL_Species(\$buffer);
250 $params{'-species'}= $species;
251 }
252
253 # References
254 elsif (/^R/) {
255 my @refs = $self->_read_EMBL_References(\$buffer);
256 foreach my $ref ( @refs ) {
257 $annotation->add_Annotation('reference',$ref);
258 }
259 }
260
261 # DB Xrefs
262 elsif (/^DR/) {
263 my @links = $self->_read_EMBL_DBLink(\$buffer);
264 foreach my $dblink ( @links ) {
265 $annotation->add_Annotation('dblink',$dblink);
266 }
267 }
268
269 # Comments
270 elsif (/^CC\s+(.*)/) {
271 $comment .= $1;
272 $comment .= " ";
273 while (defined ($_ = $self->_readline) ) {
274 if (/^CC\s+(.*)/) {
275 $comment .= $1;
276 $comment .= " ";
277 }
278 else {
279 last;
280 }
281 }
282 my $commobj = Bio::Annotation::Comment->new();
283 $commobj->text($comment);
284 $annotation->add_Annotation('comment',$commobj);
285 $comment = "";
286 }
287
288 # Get next line.
289 $buffer = $self->_readline;
290 }
291
292 while( defined ($_ = $self->_readline) ) {
293 /^FT \w/ && last;
294 /^SQ / && last;
295 /^CO / && last;
296 }
297 $buffer = $_;
298
299 if (defined($buffer) && $buffer =~ /^FT /) {
300 until( !defined ($buffer) ) {
301 my $ftunit = $self->_read_FTHelper_EMBL(\$buffer);
302 # process ftunit
303
304 push(@features,
305 $ftunit->_generic_seqfeature($self->location_factory(), $name));
306
307 if( $buffer !~ /^FT/ ) {
308 last;
309 }
310 }
311 }
312
313
314 # skip comments
315 while( defined ($buffer) && $buffer =~ /^XX/ ) {
316 $buffer = $self->_readline();
317 }
318
319 if( $buffer =~ /^CO/ ) {
320 until( !defined ($buffer) ) {
321 my $ftunit = $self->_read_FTHelper_EMBL(\$buffer);
322 # process ftunit
323 push(@features,
324 $ftunit->_generic_seqfeature($self->location_factory(),
325 $name));
326
327 if( $buffer !~ /^CO/ ) {
328 last;
329 }
330 }
331 }
332 if( $buffer !~ /^SQ/ ) {
333 while( defined ($_ = $self->_readline) ) {
334 /^SQ/ && last;
335 }
336 }
337
338 $seqc = "";
339 while( defined ($_ = $self->_readline) ) {
340 /^\/\// && last;
341 $_ = uc($_);
342 s/[^A-Za-z]//g;
343 $seqc .= $_;
344 }
345 my $seq = $self->sequence_factory->create
346 (-verbose => $self->verbose(),
347 -division => $div,
348 -seq => $seqc,
349 -desc => $desc,
350 -display_id => $name,
351 -annotation => $annotation,
352 -molecule => $mol,
353 -alphabet => $alphabet,
354 -features => \@features,
355 %params);
356
357 return $seq;
358 }
359
360 =head2 write_seq
361
362 Title : write_seq
363 Usage : $stream->write_seq($seq)
364 Function: writes the $seq object (must be seq) to the stream
365 Returns : 1 for success and 0 for error
366 Args : array of 1 to n Bio::SeqI objects
367
368
369 =cut
370
371 sub write_seq {
372 my ($self,@seqs) = @_;
373
374 foreach my $seq ( @seqs ) {
375 $self->throw("Attempting to write with no seq!") unless defined $seq;
376 unless ( ref $seq && $seq->isa('Bio::SeqI' ) ) {
377 $self->warn("$seq is not a SeqI compliant sequence object!")
378 if $self->verbose >= 0;
379 unless ( ref $seq && $seq->isa('Bio::PrimarySeqI' ) ) {
380 $self->throw("$seq is not a PrimarySeqI compliant sequence object!");
381 }
382 }
383 my $str = $seq->seq || '';
384
385 my $mol;
386 my $div;
387 my $len = $seq->length();
388
389 if ($seq->can('division') && defined $seq->division) {
390 $div = $seq->division();
391 }
392 $div ||= 'UNK';
393
394 if ($seq->can('molecule')) {
395 $mol = $seq->molecule();
396 $mol = 'RNA' if defined $mol && $mol =~ /RNA/; # no 'mRNA'
397 }
398 elsif ($seq->can('primary_seq') && defined $seq->primary_seq->alphabet) {
399 my $alphabet =$seq->primary_seq->alphabet;
400 if ($alphabet eq 'dna') {
401 $mol ='DNA';
402 }
403 elsif ($alphabet eq 'rna') {
404 $mol='RNA';
405 }
406 elsif ($alphabet eq 'protein') {
407 $mol='AA';
408 }
409 }
410 $mol ||= 'XXX';
411 $mol = "circular $mol" if $seq->is_circular;
412
413 my $temp_line;
414 if( $self->_id_generation_func ) {
415 $temp_line = &{$self->_id_generation_func}($seq);
416 } else {
417 $temp_line = sprintf("%-11sstandard; $mol; $div; %d BP.", $seq->id(), $len);
418 }
419
420 $self->_print( "ID $temp_line\n","XX\n");
421
422 # Write the accession line if present
423 my( $acc );
424 {
425 if( my $func = $self->_ac_generation_func ) {
426 $acc = &{$func}($seq);
427 } elsif( $seq->isa('Bio::Seq::RichSeqI') &&
428 defined($seq->accession_number) ) {
429 $acc = $seq->accession_number;
430 $acc = join(";", $acc, $seq->get_secondary_accessions);
431 } elsif ( $seq->can('accession_number') ) {
432 $acc = $seq->accession_number;
433 }
434
435 if (defined $acc) {
436 $self->_print("AC $acc;\n",
437 "XX\n");
438 }
439 }
440
441 # Write the sv line if present
442 {
443 my( $sv );
444 if (my $func = $self->_sv_generation_func) {
445 $sv = &{$func}($seq);
446 } elsif($seq->isa('Bio::Seq::RichSeqI') &&
447 defined($seq->seq_version)) {
448 $sv = "$acc.". $seq->seq_version();
449 }
450 if (defined $sv) {
451 $self->_print( "SV $sv\n",
452 "XX\n");
453 }
454 }
455
456 # Date lines
457 my $switch=0;
458 if( $seq->can('get_dates') ) {
459 foreach my $dt ( $seq->get_dates() ) {
460 $self->_write_line_EMBL_regex("DT ","DT ",$dt,'\s+|$',80);#'
461 $switch=1;
462 }
463 if ($switch == 1) {
464 $self->_print("XX\n");
465 }
466 }
467
468 # Description lines
469 $self->_write_line_EMBL_regex("DE ","DE ",$seq->desc(),'\s+|$',80); #'
470 $self->_print( "XX\n");
471
472 # if there, write the kw line
473 {
474 my( $kw );
475 if( my $func = $self->_kw_generation_func ) {
476 $kw = &{$func}($seq);
477 } elsif( $seq->can('keywords') ) {
478 $kw = $seq->keywords;
479 if( ref($kw) =~ /ARRAY/i ) {
480 $kw = join("; ", @$kw);
481 }
482 $kw .= '.' if( defined $kw && $kw !~ /\.$/ );
483 }
484 if (defined $kw) {
485 $self->_write_line_EMBL_regex("KW ", "KW ",
486 $kw, '\s+|$', 80); #'
487 $self->_print( "XX\n");
488
489 }
490 }
491
492 # Organism lines
493
494 if ($seq->can('species') && (my $spec = $seq->species)) {
495 my($species, @class) = $spec->classification();
496 my $genus = $class[0];
497 my $OS = "$genus $species";
498 if (my $ssp = $spec->sub_species) {
499 $OS .= " $ssp";
500 }
501 if (my $common = $spec->common_name) {
502 $OS .= " ($common)";
503 }
504 $self->_print("OS $OS\n");
505 my $OC = join('; ', reverse(@class)) .'.';
506 $self->_write_line_EMBL_regex("OC ","OC ",$OC,'; |$',80); #'
507 if ($spec->organelle) {
508 $self->_write_line_EMBL_regex("OG ","OG ",$spec->organelle,'; |$',80); #'
509 }
510 $self->_print("XX\n");
511 }
512
513 # Reference lines
514 my $t = 1;
515 if ( $seq->can('annotation') && defined $seq->annotation ) {
516 foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
517 $self->_print( "RN [$t]\n");
518
519 # Having no RP line is legal, but we need both
520 # start and end for a valid location.
521 my $start = $ref->start;
522 my $end = $ref->end;
523 if ($start and $end) {
524 $self->_print( "RP $start-$end\n");
525 } elsif ($start or $end) {
526 $self->throw("Both start and end are needed for a valid RP line. Got: start='$start' end='$end'");
527 }
528
529 if (my $med = $ref->medline) {
530 $self->_print( "RX MEDLINE; $med.\n");
531 }
532 if (my $pm = $ref->pubmed) {
533 $self->_print( "RX PUBMED; $pm.\n");
534 }
535 $self->_write_line_EMBL_regex("RA ", "RA ",
536 $ref->authors . ";",
537 '\s+|$', 80); #'
538
539 # If there is no title to the reference, it appears
540 # as a single semi-colon. All titles must end in
541 # a semi-colon.
542 my $ref_title = $ref->title || '';
543 $ref_title =~ s/[\s;]*$/;/;
544 $self->_write_line_EMBL_regex("RT ", "RT ", $ref_title, '\s+|$', 80); #'
545 $self->_write_line_EMBL_regex("RL ", "RL ", $ref->location, '\s+|$', 80); #'
546 if ($ref->comment) {
547 $self->_write_line_EMBL_regex("RC ", "RC ", $ref->comment, '\s+|$', 80); #'
548 }
549 $self->_print("XX\n");
550 $t++;
551 }
552
553
554 # DB Xref lines
555 if (my @db_xref = $seq->annotation->get_Annotations('dblink') ) {
556 foreach my $dr (@db_xref) {
557 my $db_name = $dr->database;
558 my $prim = $dr->primary_id;
559 my $opt = $dr->optional_id || '';
560
561 my $line = "$db_name; $prim; $opt.";
562 $self->_write_line_EMBL_regex("DR ", "DR ", $line, '\s+|$', 80); #'
563 }
564 $self->_print("XX\n");
565 }
566
567 # Comment lines
568 foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
569 $self->_write_line_EMBL_regex("CC ", "CC ", $comment->text, '\s+|$', 80); #'
570 $self->_print("XX\n");
571 }
572 }
573 # "\\s\+\|\$"
574
575 ## FEATURE TABLE
576
577 $self->_print("FH Key Location/Qualifiers\n");
578 $self->_print("FH\n");
579
580 my @feats = $seq->can('top_SeqFeatures') ? $seq->top_SeqFeatures : ();
581 if ($feats[0]) {
582 if( defined $self->_post_sort ) {
583 # we need to read things into an array.
584 # Process. Sort them. Print 'em
585
586 my $post_sort_func = $self->_post_sort();
587 my @fth;
588
589 foreach my $sf ( @feats ) {
590 push(@fth,Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq));
591 }
592
593 @fth = sort { &$post_sort_func($a,$b) } @fth;
594
595 foreach my $fth ( @fth ) {
596 $self->_print_EMBL_FTHelper($fth);
597 }
598 } else {
599 # not post sorted. And so we can print as we get them.
600 # lower memory load...
601
602 foreach my $sf ( @feats ) {
603 my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq);
604 foreach my $fth ( @fth ) {
605 if( $fth->key eq 'CONTIG') {
606 $self->_show_dna(0);
607 }
608 $self->_print_EMBL_FTHelper($fth);
609 }
610 }
611 }
612 }
613
614 if( $self->_show_dna() == 0 ) {
615 $self->_print( "//\n");
616 return;
617 }
618 $self->_print( "XX\n");
619
620 # finished printing features.
621
622 $str =~ tr/A-Z/a-z/;
623
624 # Count each nucleotide
625 my $alen = $str =~ tr/a/a/;
626 my $clen = $str =~ tr/c/c/;
627 my $glen = $str =~ tr/g/g/;
628 my $tlen = $str =~ tr/t/t/;
629
630 my $olen = $len - ($alen + $tlen + $clen + $glen);
631 if( $olen < 0 ) {
632 $self->warn("Weird. More atgc than bases. Problem!");
633 }
634
635 $self->_print("SQ Sequence $len BP; $alen A; $clen C; $glen G; $tlen T; $olen other;\n");
636
637 my $nuc = 60; # Number of nucleotides per line
638 my $whole_pat = 'a10' x 6; # Pattern for unpacking a whole line
639 my $out_pat = 'A11' x 6; # Pattern for packing a line
640 my $length = length($str);
641
642 # Calculate the number of nucleotides which fit on whole lines
643 my $whole = int($length / $nuc) * $nuc;
644
645 # Print the whole lines
646 my( $i );
647 for ($i = 0; $i < $whole; $i += $nuc) {
648 my $blocks = pack $out_pat,
649 unpack $whole_pat,
650 substr($str, $i, $nuc);
651 $self->_print(sprintf(" $blocks%9d\n", $i + $nuc));
652 }
653
654 # Print the last line
655 if (my $last = substr($str, $i)) {
656 my $last_len = length($last);
657 my $last_pat = 'a10' x int($last_len / 10) .'a'. $last_len % 10;
658 my $blocks = pack $out_pat,
659 unpack($last_pat, $last);
660 $self->_print(sprintf(" $blocks%9d\n", $length)); # Add the length to the end
661 }
662
663 $self->_print( "//\n");
664
665 $self->flush if $self->_flush_on_write && defined $self->_fh;
666 return 1;
667 }
668 }
669
670 =head2 _print_EMBL_FTHelper
671
672 Title : _print_EMBL_FTHelper
673 Usage :
674 Function: Internal function
675 Returns :
676 Args :
677
678
679 =cut
680
681 sub _print_EMBL_FTHelper {
682 my ($self,$fth,$always_quote) = @_;
683 $always_quote ||= 0;
684
685 if( ! ref $fth || ! $fth->isa('Bio::SeqIO::FTHelper') ) {
686 $fth->warn("$fth is not a FTHelper class. Attempting to print, but there could be tears!");
687 }
688
689
690 #$self->_print( "FH Key Location/Qualifiers\n");
691 #$self->_print( sprintf("FT %-15s %s\n",$fth->key,$fth->loc));
692 # let
693 if( $fth->key eq 'CONTIG' ) {
694 $self->_print("XX\n");
695 $self->_write_line_EMBL_regex("CO ",
696 "CO ",$fth->loc,
697 '\,|$',80); #'
698 return;
699 }
700 $self->_write_line_EMBL_regex(sprintf("FT %-15s ",$fth->key),
701 "FT ",$fth->loc,
702 '\,|$',80); #'
703 foreach my $tag ( keys %{$fth->field} ) {
704 if( ! defined $fth->field->{$tag} ) { next; }
705 foreach my $value ( @{$fth->field->{$tag}} ) {
706 $value =~ s/\"/\"\"/g;
707 if ($value eq "_no_value") {
708 $self->_write_line_EMBL_regex("FT ",
709 "FT ",
710 "/$tag",'.|$',80); #'
711 }
712 elsif( $always_quote == 1 || $value !~ /^\d+$/ ) {
713 my $pat = $value =~ /\s/ ? '\s|$' : '.|$';
714 $self->_write_line_EMBL_regex("FT ",
715 "FT ",
716 "/$tag=\"$value\"",$pat,80);
717 }
718 else {
719 $self->_write_line_EMBL_regex("FT ",
720 "FT ",
721 "/$tag=$value",'.|$',80); #'
722 }
723 # $self->_print( "FT /", $tag, "=\"", $value, "\"\n");
724 }
725 }
726 }
727
728 #'
729 =head2 _read_EMBL_References
730
731 Title : _read_EMBL_References
732 Usage :
733 Function: Reads references from EMBL format. Internal function really
734 Example :
735 Returns :
736 Args :
737
738
739 =cut
740
741 sub _read_EMBL_References {
742 my ($self,$buffer) = @_;
743 my (@refs);
744
745 # assumme things are starting with RN
746
747 if( $$buffer !~ /^RN/ ) {
748 warn("Not parsing line '$$buffer' which maybe important");
749 }
750 my $b1;
751 my $b2;
752 my $title;
753 my $loc;
754 my $au;
755 my $med;
756 my $pm;
757 my $com;
758
759 while( defined ($_ = $self->_readline) ) {
760 /^R/ || last;
761 /^RP (\d+)-(\d+)/ && do {$b1=$1;$b2=$2;};
762 /^RX MEDLINE;\s+(\d+)/ && do {$med=$1};
763 /^RX PUBMED;\s+(\d+)/ && do {$pm=$1};
764 /^RA (.*)/ && do {
765 $au = $self->_concatenate_lines($au,$1); next;
766 };
767 /^RT (.*)/ && do {
768 $title = $self->_concatenate_lines($title,$1); next;
769 };
770 /^RL (.*)/ && do {
771 $loc = $self->_concatenate_lines($loc,$1); next;
772 };
773 /^RC (.*)/ && do {
774 $com = $self->_concatenate_lines($com,$1); next;
775 };
776 }
777
778 my $ref = new Bio::Annotation::Reference;
779 $au =~ s/;\s*$//g;
780 $title =~ s/;\s*$//g;
781
782 $ref->start($b1);
783 $ref->end($b2);
784 $ref->authors($au);
785 $ref->title($title);
786 $ref->location($loc);
787 $ref->medline($med);
788 $ref->comment($com);
789 $ref->pubmed($pm);
790
791 push(@refs,$ref);
792 $$buffer = $_;
793
794 return @refs;
795 }
796
797 =head2 _read_EMBL_Species
798
799 Title : _read_EMBL_Species
800 Usage :
801 Function: Reads the EMBL Organism species and classification
802 lines.
803 Example :
804 Returns : A Bio::Species object
805 Args : a reference to the current line buffer
806
807 =cut
808
809 sub _read_EMBL_Species {
810 my( $self, $buffer ) = @_;
811 my $org;
812
813 $_ = $$buffer;
814 my( $sub_species, $species, $genus, $common, @class );
815 while (defined( $_ ||= $self->_readline )) {
816
817 if (/^OS\s+(\S+)(?:\s+([^\(]\S*))?(?:\s+([^\(]\S*))?(?:\s+\((.*)\))?/) {
818 $genus = $1;
819 $species = $2 || 'sp.';
820 $sub_species = $3 if $3;
821 $common = $4 if $4;
822 }
823 elsif (s/^OC\s+//) {
824 # only split on ';' or '.' so that
825 # classification that is 2 words will
826 # still get matched
827 # use map to remove trailing/leading spaces
828 chomp;
829 push(@class, map { s/^\s+//; s/\s+$//; $_; } split /[;\.]+/);
830 }
831 elsif (/^OG\s+(.*)/) {
832 $org = $1;
833 }
834 else {
835 last;
836 }
837
838 $_ = undef; # Empty $_ to trigger read of next line
839 }
840
841 $$buffer = $_;
842
843 # Don't make a species object if it is "Unknown" or "None"
844 return if $genus =~ /^(Unknown|None)$/i;
845
846 # Bio::Species array needs array in Species -> Kingdom direction
847 if ($class[$#class] eq $genus) {
848 push( @class, $species );
849 } else {
850 push( @class, $genus, $species );
851 }
852 @class = reverse @class;
853
854 my $make = Bio::Species->new();
855 $make->classification( \@class, "FORCE" ); # no name validation please
856 $make->common_name( $common ) if $common;
857 $make->sub_species( $sub_species ) if $sub_species;
858 $make->organelle ( $org ) if $org;
859 return $make;
860 }
861
862 =head2 _read_EMBL_DBLink
863
864 Title : _read_EMBL_DBLink
865 Usage :
866 Function: Reads the EMBL database cross reference ("DR") lines
867 Example :
868 Returns : A list of Bio::Annotation::DBLink objects
869 Args :
870
871 =cut
872
873 sub _read_EMBL_DBLink {
874 my( $self,$buffer ) = @_;
875 my( @db_link );
876
877 $_ = $$buffer;
878 while (defined( $_ ||= $self->_readline )) {
879
880 if (my($databse, $prim_id, $sec_id)
881 = /^DR ([^\s;]+);\s*([^\s;]+);\s*([^\s;]+)?\.$/) {
882 my $link = Bio::Annotation::DBLink->new();
883 $link->database ( $databse );
884 $link->primary_id ( $prim_id );
885 $link->optional_id( $sec_id ) if $sec_id;
886 push(@db_link, $link);
887 }
888 else {
889 last;
890 }
891
892 $_ = undef; # Empty $_ to trigger read of next line
893 }
894
895 $$buffer = $_;
896
897 return @db_link;
898 }
899
900 =head2 _filehandle
901
902 Title : _filehandle
903 Usage : $obj->_filehandle($newval)
904 Function:
905 Example :
906 Returns : value of _filehandle
907 Args : newvalue (optional)
908
909
910 =cut
911
912 sub _filehandle{
913 my ($obj,$value) = @_;
914 if( defined $value) {
915 $obj->{'_filehandle'} = $value;
916 }
917 return $obj->{'_filehandle'};
918
919 }
920
921 =head2 _read_FTHelper_EMBL
922
923 Title : _read_FTHelper_EMBL
924 Usage : _read_FTHelper_EMBL($buffer)
925 Function: reads the next FT key line
926 Example :
927 Returns : Bio::SeqIO::FTHelper object
928 Args : filehandle and reference to a scalar
929
930
931 =cut
932
933 sub _read_FTHelper_EMBL {
934 my ($self,$buffer) = @_;
935
936 my ($key, # The key of the feature
937 $loc, # The location line from the feature
938 @qual, # An arrray of lines making up the qualifiers
939 );
940
941 if ($$buffer =~ /^FT\s{3}(\S+)\s+(\S+)/) {
942 $key = $1;
943 $loc = $2;
944 # Read all the lines up to the next feature
945 while ( defined($_ = $self->_readline) ) {
946 if (/^FT(\s+)(.+?)\s*$/) {
947 # Lines inside features are preceeded by 19 spaces
948 # A new feature is preceeded by 3 spaces
949 if (length($1) > 4) {
950 # Add to qualifiers if we're in the qualifiers
951 if (@qual) {
952 push(@qual, $2);
953 }
954 # Start the qualifier list if it's the first qualifier
955 elsif (substr($2, 0, 1) eq '/') {
956 @qual = ($2);
957 }
958 # We're still in the location line, so append to location
959 else {
960 $loc .= $2;
961 }
962 } else {
963 # We've reached the start of the next feature
964 last;
965 }
966 } else {
967 # We're at the end of the feature table
968 last;
969 }
970 }
971 } elsif( $$buffer =~ /^CO\s+(\S+)/) {
972 $key = 'CONTIG';
973 $loc = $1;
974 # Read all the lines up to the next feature
975 while ( defined($_ = $self->_readline) ) {
976 if (/^CO\s+(\S+)\s*$/) {
977 $loc .= $1;
978 } else {
979 # We've reached the start of the next feature
980 last;
981 }
982 }
983 } else {
984 # No feature key
985 return;
986 }
987
988 # Put the first line of the next feature into the buffer
989 $$buffer = $_;
990
991 # Make the new FTHelper object
992 my $out = new Bio::SeqIO::FTHelper();
993 $out->verbose($self->verbose());
994 $out->key($key);
995 $out->loc($loc);
996
997 # Now parse and add any qualifiers. (@qual is kept
998 # intact to provide informative error messages.)
999 QUAL: for (my $i = 0; $i < @qual; $i++) {
1000 $_ = $qual[$i];
1001 my( $qualifier, $value ) = m{^/([^=]+)(?:=(.+))?}
1002 or $self->throw("Can't see new qualifier in: $_\nfrom:\n"
1003 . join('', map "$_\n", @qual));
1004 if (defined $value) {
1005 # Do we have a quoted value?
1006 if (substr($value, 0, 1) eq '"') {
1007 # Keep adding to value until we find the trailing quote
1008 # and the quotes are balanced
1009 while ($value !~ /"$/ or $value =~ tr/"/"/ % 2) { #"
1010 $i++;
1011 my $next = $qual[$i];
1012 unless (defined($next)) {
1013 warn("Unbalanced quote in:\n", map("$_\n", @qual),
1014 "No further qualifiers will be added for this feature");
1015 last QUAL;
1016 }
1017
1018 # Join to value with space if value or next line contains a space
1019 $value .= (grep /\s/, ($value, $next)) ? " $next" : $next;
1020 }
1021 # Trim leading and trailing quotes
1022 $value =~ s/^"|"$//g;
1023 # Undouble internal quotes
1024 $value =~ s/""/"/g; #"
1025 }
1026 } else {
1027 $value = '_no_value';
1028 }
1029
1030 # Store the qualifier
1031 $out->field->{$qualifier} ||= [];
1032 push(@{$out->field->{$qualifier}},$value);
1033 }
1034
1035 return $out;
1036 }
1037
1038 =head2 _write_line_EMBL
1039
1040 Title : _write_line_EMBL
1041 Usage :
1042 Function: internal function
1043 Example :
1044 Returns :
1045 Args :
1046
1047
1048 =cut
1049
1050 sub _write_line_EMBL {
1051 my ($self,$pre1,$pre2,$line,$length) = @_;
1052
1053 $length || die "Miscalled write_line_EMBL without length. Programming error!";
1054 my $subl = $length - length $pre2;
1055 my $linel = length $line;
1056 my $i;
1057
1058 my $sub = substr($line,0,$length - length $pre1);
1059
1060 $self->_print( "$pre1$sub\n");
1061
1062 for($i= ($length - length $pre1);$i < $linel;) {
1063 $sub = substr($line,$i,($subl));
1064 $self->_print( "$pre2$sub\n");
1065 $i += $subl;
1066 }
1067
1068 }
1069
1070 =head2 _write_line_EMBL_regex
1071
1072 Title : _write_line_EMBL_regex
1073 Usage :
1074 Function: internal function for writing lines of specified
1075 length, with different first and the next line
1076 left hand headers and split at specific points in the
1077 text
1078 Example :
1079 Returns : nothing
1080 Args : file handle, first header, second header, text-line, regex for line breaks, total line length
1081
1082
1083 =cut
1084
1085 sub _write_line_EMBL_regex {
1086 my ($self,$pre1,$pre2,$line,$regex,$length) = @_;
1087
1088 #print STDOUT "Going to print with $line!\n";
1089
1090 $length || die "Programming error - called write_line_EMBL_regex without length.";
1091
1092 my $subl = $length - (length $pre1) -1 ;
1093
1094
1095
1096 my( @lines );
1097 while(defined $line &&
1098 $line =~ m/(.{1,$subl})($regex)/g) {
1099 push(@lines, $1.$2);
1100 }
1101 foreach (@lines) { s/\s+$//; }
1102
1103 # Print first line
1104 my $s = shift(@lines) || '';
1105 $self->_print( "$pre1$s\n");
1106
1107 # Print the rest
1108 foreach my $s ( @lines ) {
1109 $s = '' if( !defined $s );
1110 $self->_print( "$pre2$s\n");
1111 }
1112 }
1113
1114 =head2 _post_sort
1115
1116 Title : _post_sort
1117 Usage : $obj->_post_sort($newval)
1118 Function:
1119 Returns : value of _post_sort
1120 Args : newvalue (optional)
1121
1122
1123 =cut
1124
1125 sub _post_sort{
1126 my $obj = shift;
1127 if( @_ ) {
1128 my $value = shift;
1129 $obj->{'_post_sort'} = $value;
1130 }
1131 return $obj->{'_post_sort'};
1132
1133 }
1134
1135 =head2 _show_dna
1136
1137 Title : _show_dna
1138 Usage : $obj->_show_dna($newval)
1139 Function:
1140 Returns : value of _show_dna
1141 Args : newvalue (optional)
1142
1143
1144 =cut
1145
1146 sub _show_dna{
1147 my $obj = shift;
1148 if( @_ ) {
1149 my $value = shift;
1150 $obj->{'_show_dna'} = $value;
1151 }
1152 return $obj->{'_show_dna'};
1153
1154 }
1155
1156 =head2 _id_generation_func
1157
1158 Title : _id_generation_func
1159 Usage : $obj->_id_generation_func($newval)
1160 Function:
1161 Returns : value of _id_generation_func
1162 Args : newvalue (optional)
1163
1164
1165 =cut
1166
1167 sub _id_generation_func{
1168 my $obj = shift;
1169 if( @_ ) {
1170 my $value = shift;
1171 $obj->{'_id_generation_func'} = $value;
1172 }
1173 return $obj->{'_id_generation_func'};
1174
1175 }
1176
1177 =head2 _ac_generation_func
1178
1179 Title : _ac_generation_func
1180 Usage : $obj->_ac_generation_func($newval)
1181 Function:
1182 Returns : value of _ac_generation_func
1183 Args : newvalue (optional)
1184
1185
1186 =cut
1187
1188 sub _ac_generation_func{
1189 my $obj = shift;
1190 if( @_ ) {
1191 my $value = shift;
1192 $obj->{'_ac_generation_func'} = $value;
1193 }
1194 return $obj->{'_ac_generation_func'};
1195
1196 }
1197
1198 =head2 _sv_generation_func
1199
1200 Title : _sv_generation_func
1201 Usage : $obj->_sv_generation_func($newval)
1202 Function:
1203 Returns : value of _sv_generation_func
1204 Args : newvalue (optional)
1205
1206
1207 =cut
1208
1209 sub _sv_generation_func{
1210 my $obj = shift;
1211 if( @_ ) {
1212 my $value = shift;
1213 $obj->{'_sv_generation_func'} = $value;
1214 }
1215 return $obj->{'_sv_generation_func'};
1216
1217 }
1218
1219 =head2 _kw_generation_func
1220
1221 Title : _kw_generation_func
1222 Usage : $obj->_kw_generation_func($newval)
1223 Function:
1224 Returns : value of _kw_generation_func
1225 Args : newvalue (optional)
1226
1227
1228 =cut
1229
1230 sub _kw_generation_func{
1231 my $obj = shift;
1232 if( @_ ) {
1233 my $value = shift;
1234 $obj->{'_kw_generation_func'} = $value;
1235 }
1236 return $obj->{'_kw_generation_func'};
1237
1238 }
1239
1240 1;