0
|
1 # $Id: genbank.pm,v 1.76.2.12 2003/09/13 23:33:04 jason Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::SeqIO::GenBank
|
|
4 #
|
|
5 # Cared for by Elia Stupka <elia@tll.org.sg>
|
|
6 #
|
|
7 # Copyright Elia Stupka
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10
|
|
11 # POD documentation - main docs before the code
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 Bio::SeqIO::GenBank - GenBank sequence input/output stream
|
|
16
|
|
17 =head1 SYNOPSIS
|
|
18
|
|
19 It is probably best not to use this object directly, but
|
|
20 rather go through the SeqIO handler system. Go:
|
|
21
|
|
22 $stream = Bio::SeqIO->new(-file => $filename, -format => 'GenBank');
|
|
23
|
|
24 while ( my $seq = $stream->next_seq() ) {
|
|
25 # do something with $seq
|
|
26 }
|
|
27
|
|
28
|
|
29 =head1 DESCRIPTION
|
|
30
|
|
31 This object can transform Bio::Seq objects to and from GenBank flat
|
|
32 file databases.
|
|
33
|
|
34 There is alot of flexibility here about how to dump things which I need
|
|
35 to document fully.
|
|
36
|
|
37 =head2 Mapping of record properties to object properties
|
|
38
|
|
39 This section is supposed to document which sections and properties of
|
|
40 a GenBank databank record end up where in the Bioperl object model. It
|
|
41 is far from complete and presently focuses only on those mappings
|
|
42 which may be non-obvious. $seq in the text refers to the
|
|
43 Bio::Seq::RichSeqI implementing object returned by the parser for each
|
|
44 record.
|
|
45
|
|
46 =over 4
|
|
47
|
|
48 =item GI number
|
|
49
|
|
50 $seq-E<gt>primary_id
|
|
51
|
|
52 =back
|
|
53
|
|
54 =head2 Optional functions
|
|
55
|
|
56 =over 3
|
|
57
|
|
58 =item _show_dna()
|
|
59
|
|
60 (output only) shows the dna or not
|
|
61
|
|
62 =item _post_sort()
|
|
63
|
|
64 (output only) provides a sorting func which is applied to the FTHelpers
|
|
65 before printing
|
|
66
|
|
67 =item _id_generation_func()
|
|
68
|
|
69 This is function which is called as
|
|
70
|
|
71 print "ID ", $func($seq), "\n";
|
|
72
|
|
73 To generate the ID line. If it is not there, it generates a sensible ID
|
|
74 line using a number of tools.
|
|
75
|
|
76
|
|
77 If you want to output annotations in genbank format they need to be
|
|
78 stored in a Bio::Annotation::Collection object which is accessible
|
|
79 through the Bio::SeqI interface method L<annotation()|annotation>.
|
|
80
|
|
81 The following are the names of the keys which are polled from a
|
|
82 L<Bio::Annotation::Collection> object.
|
|
83
|
|
84 reference - Should contain Bio::Annotation::Reference objects
|
|
85 comment - Should contain Bio::Annotation::Comment objects
|
|
86
|
|
87 segment - Should contain a Bio::Annotation::SimpleValue object
|
|
88 origin - Should contain a Bio::Annotation::SimpleValue object
|
|
89
|
|
90 =back
|
|
91
|
|
92 =head1 Where does the data go?
|
|
93
|
|
94 Data parsed in Bio::SeqIO::genbank is stored in a variety of data
|
|
95 fields in the sequence object that is returned. More information in
|
|
96 the HOWTOs about exactly what each field means and where it goes.
|
|
97 Here is a partial list of fields.
|
|
98
|
|
99 Items listed as RichSeq or Seq or PrimarySeq and then NAME() tell you
|
|
100 the top level object which defines a function called NAME() which
|
|
101 stores this information.
|
|
102
|
|
103 Items listed as Annotation 'NAME' tell you the data is stored the
|
|
104 associated Bio::Annotation::Colection object which is associated with
|
|
105 Bio::Seq objects. If it is explictly requested that no annotations
|
|
106 should be stored when parsing a record of course they won't be
|
|
107 available when you try and get them. If you are having this problem
|
|
108 look at the type of SeqBuilder that is being used to contruct your
|
|
109 sequence object.
|
|
110
|
|
111 Comments Annotation 'comment'
|
|
112 References Annotation 'reference'
|
|
113 Segment Annotation 'segment'
|
|
114 Origin Annotation 'origin'
|
|
115
|
|
116 Accessions PrimarySeq accession_number()
|
|
117 Secondary accessions RichSeq get_secondary_accessions()
|
|
118 Keywords RichSeq keywords()
|
|
119 Dates RichSeq get_dates()
|
|
120 Molecule RichSeq molecule()
|
|
121 Seq Version RichSeq seq_version()
|
|
122 PID RichSeq pid()
|
|
123 Division RichSeq division()
|
|
124 Features Seq get_SeqFeatures()
|
|
125 Alphabet PrimarySeq alphabet()
|
|
126 Definition PrimarySeq description() or desc()
|
|
127 Version PrimarySeq version()
|
|
128
|
|
129 Sequence PrimarySeq seq()
|
|
130
|
|
131 =head1 FEEDBACK
|
|
132
|
|
133 =head2 Mailing Lists
|
|
134
|
|
135 User feedback is an integral part of the evolution of this
|
|
136 and other Bioperl modules. Send your comments and suggestions preferably
|
|
137 to one of the Bioperl mailing lists.
|
|
138 Your participation is much appreciated.
|
|
139
|
|
140 bioperl-l@bioperl.org - General discussion
|
|
141 http://www.bioperl.org/MailList.shtml - About the mailing lists
|
|
142
|
|
143 =head2 Reporting Bugs
|
|
144
|
|
145 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
146 the bugs and their resolution.
|
|
147 Bug reports can be submitted via email or the web:
|
|
148
|
|
149 bioperl-bugs@bio.perl.org
|
|
150 http://bugzilla.bioperl.org/
|
|
151
|
|
152 =head1 AUTHOR - Elia Stupka
|
|
153
|
|
154 Email elia@tll.org.sg
|
|
155
|
|
156 =head1 CONTRIBUTORS
|
|
157
|
|
158 Ewan Birney birney@ebi.ac.uk
|
|
159 Jason Stajich jason@bioperl.org
|
|
160 Chris Mungall cjm@fruitfly.bdgp.berkeley.edu
|
|
161 Lincoln Stein lstein@cshl.org
|
|
162 Heikki Lehvaslaiho, heikki@ebi.ac.uk
|
|
163 Hilmar Lapp, hlapp@gmx.net
|
|
164 Donald G. Jackson, donald.jackson@bms.com
|
|
165
|
|
166 =head1 APPENDIX
|
|
167
|
|
168 The rest of the documentation details each of the object
|
|
169 methods. Internal methods are usually preceded with a _
|
|
170
|
|
171 =cut
|
|
172
|
|
173 # Let the code begin...
|
|
174
|
|
175 package Bio::SeqIO::genbank;
|
|
176 use vars qw(@ISA);
|
|
177 use strict;
|
|
178
|
|
179 use Bio::SeqIO;
|
|
180 use Bio::SeqIO::FTHelper;
|
|
181 use Bio::SeqFeature::Generic;
|
|
182 use Bio::Species;
|
|
183 use Bio::Seq::SeqFactory;
|
|
184 use Bio::Annotation::Collection;
|
|
185 use Bio::Annotation::Comment;
|
|
186 use Bio::Annotation::Reference;
|
|
187 use Bio::Annotation::DBLink;
|
|
188
|
|
189 @ISA = qw(Bio::SeqIO);
|
|
190
|
|
191 sub _initialize {
|
|
192 my($self,@args) = @_;
|
|
193
|
|
194 $self->SUPER::_initialize(@args);
|
|
195 # hash for functions for decoding keys.
|
|
196 $self->{'_func_ftunit_hash'} = {};
|
|
197 $self->_show_dna(1); # sets this to one by default. People can change it
|
|
198 if( ! defined $self->sequence_factory ) {
|
|
199 $self->sequence_factory(new Bio::Seq::SeqFactory
|
|
200 (-verbose => $self->verbose(),
|
|
201 -type => 'Bio::Seq::RichSeq'));
|
|
202 }
|
|
203 }
|
|
204
|
|
205 =head2 next_seq
|
|
206
|
|
207 Title : next_seq
|
|
208 Usage : $seq = $stream->next_seq()
|
|
209 Function: returns the next sequence in the stream
|
|
210 Returns : Bio::Seq object
|
|
211 Args :
|
|
212
|
|
213 =cut
|
|
214
|
|
215 sub next_seq {
|
|
216 my ($self,@args) = @_;
|
|
217 my $builder = $self->sequence_builder();
|
|
218 my $seq;
|
|
219 my %params;
|
|
220
|
|
221 RECORDSTART: while (1) {
|
|
222 my $buffer;
|
|
223 my (@acc, @features);
|
|
224 my ($display_id, $annotation);
|
|
225 my $species;
|
|
226
|
|
227 # initialize; we may come here because of starting over
|
|
228 @features = ();
|
|
229 $annotation = undef;
|
|
230 @acc = ();
|
|
231 $species = undef;
|
|
232 %params = (-verbose => $self->verbose); # reset hash
|
|
233 local($/) = "\n";
|
|
234 while(defined($buffer = $self->_readline())) {
|
|
235 last if index($buffer,'LOCUS ') == 0;
|
|
236 }
|
|
237 return undef if( !defined $buffer ); # end of file
|
|
238 $buffer =~ /^LOCUS\s+(\S.*)$/ ||
|
|
239 $self->throw("GenBank stream with bad LOCUS line. Not GenBank in my book. Got '$buffer'");
|
|
240
|
|
241 my @tokens = split(' ', $1);
|
|
242
|
|
243 # this is important to have the id for display in e.g. FTHelper,
|
|
244 # otherwise you won't know which entry caused an error
|
|
245 $display_id = shift(@tokens);
|
|
246 $params{'-display_id'} = $display_id;
|
|
247 # may still be useful if we don't want the seq
|
|
248 $params{'-length'} = shift(@tokens);
|
|
249 # the alphabet of the entry
|
|
250 $params{'-alphabet'} = (lc(shift @tokens) eq 'bp') ? 'dna' : 'protein';
|
|
251 # for aa there is usually no 'molecule' (mRNA etc)
|
|
252 if (($params{'-alphabet'} eq 'dna') || (@tokens > 2)) {
|
|
253 $params{'-molecule'} = shift(@tokens);
|
|
254 my $circ = shift(@tokens);
|
|
255 if ($circ eq 'circular') {
|
|
256 $params{'-is_circular'} = 1;
|
|
257 $params{'-division'} = shift(@tokens);
|
|
258 } else {
|
|
259 # 'linear' or 'circular' may actually be omitted altogether
|
|
260 $params{'-division'} =
|
|
261 (CORE::length($circ) == 3 ) ? $circ : shift(@tokens);
|
|
262 }
|
|
263 } else {
|
|
264 $params{'-molecule'} = 'PRT' if($params{'-alphabet'} eq 'aa');
|
|
265 $params{'-division'} = shift(@tokens);
|
|
266 }
|
|
267 my $date = join(' ', @tokens); # we lump together the rest
|
|
268 # this is per request bug #1513
|
|
269 # we can handle
|
|
270 # 9-10-2003
|
|
271 # 9-10-03
|
|
272 #09-10-2003
|
|
273 #09-10-03
|
|
274 if($date =~ s/\s*((\d{1,2})-(\w{3})-(\d{2,4})).*/$1/) {
|
|
275 if( length($date) < 11 ) { # improperly formatted date
|
|
276 # But we'll be nice and fix it for them
|
|
277 my ($d,$m,$y) = ($2,$3,$4);
|
|
278 if( length($d) == 1 ) {
|
|
279 $d = "0$d";
|
|
280 }
|
|
281 # guess the century here
|
|
282 if( length($y) == 2 ) {
|
|
283 if( $y > 60 ) { # arbitrarily guess that '60' means 1960
|
|
284 $y = "19$y";
|
|
285 } else {
|
|
286 $y = "20$y";
|
|
287 }
|
|
288 $self->warn("Date was malformed, guessing the century for $date to be $y\n");
|
|
289 }
|
|
290 $params{'-dates'} = [join('-',$d,$m,$y)];
|
|
291 } else {
|
|
292 $params{'-dates'} = [$date];
|
|
293 }
|
|
294 }
|
|
295
|
|
296 # set them all at once
|
|
297 $builder->add_slot_value(%params);
|
|
298 %params = ();
|
|
299
|
|
300 # parse the rest if desired, otherwise start over
|
|
301 if(! $builder->want_object()) {
|
|
302 $builder->make_object();
|
|
303 next RECORDSTART;
|
|
304 }
|
|
305
|
|
306 # set up annotation depending on what the builder wants
|
|
307 if($builder->want_slot('annotation')) {
|
|
308 $annotation = new Bio::Annotation::Collection;
|
|
309 }
|
|
310 $buffer = $self->_readline();
|
|
311 until( !defined ($buffer) ) {
|
|
312 $_ = $buffer;
|
|
313
|
|
314 # Description line(s)
|
|
315 if (/^DEFINITION\s+(\S.*\S)/) {
|
|
316 my @desc = ($1);
|
|
317 while ( defined($_ = $self->_readline) ) {
|
|
318 if( /^\s+(.*)/ ) { push (@desc, $1); next };
|
|
319 last;
|
|
320 }
|
|
321 $builder->add_slot_value(-desc => join(' ', @desc));
|
|
322 # we'll continue right here because DEFINITION always comes
|
|
323 # at the top of the entry
|
|
324 }
|
|
325 # accession number (there can be multiple accessions)
|
|
326 if( /^ACCESSION\s+(\S.*\S)/ ) {
|
|
327 push(@acc, split(/\s+/,$1));
|
|
328 while( defined($_ = $self->_readline) ) {
|
|
329 /^\s+(.*)/ && do { push (@acc, split(/\s+/,$1)); next };
|
|
330 last;
|
|
331 }
|
|
332 $buffer = $_;
|
|
333 next;
|
|
334 }
|
|
335 # PID
|
|
336 elsif( /^PID\s+(\S+)/ ) {
|
|
337 $params{'-pid'} = $1;
|
|
338 }
|
|
339 #Version number
|
|
340 elsif( /^VERSION\s+(.+)$/ ) {
|
|
341 my ($acc,$gi) = split(' ',$1);
|
|
342 if($acc =~ /^\w+\.(\d+)/) {
|
|
343 $params{'-version'} = $1;
|
|
344 $params{'-seq_version'} = $1;
|
|
345 }
|
|
346 if($gi && (index($gi,"GI:") == 0)) {
|
|
347 $params{'-primary_id'} = substr($gi,3);
|
|
348 }
|
|
349 }
|
|
350 #Keywords
|
|
351 elsif( /^KEYWORDS\s+(.*)/ ) {
|
|
352 my @kw = split(/\s*\;\s*/,$1);
|
|
353 while( defined($_ = $self->_readline) ) {
|
|
354 chomp;
|
|
355 /^\s+(.*)/ && do { push (@kw, split(/\s*\;\s*/,$1)); next };
|
|
356 last;
|
|
357 }
|
|
358
|
|
359 @kw && $kw[-1] =~ s/\.$//;
|
|
360 $params{'-keywords'} = \@kw;
|
|
361 $buffer = $_;
|
|
362 next;
|
|
363 }
|
|
364 # Organism name and phylogenetic information
|
|
365 elsif (/^SOURCE/) {
|
|
366 if($builder->want_slot('species')) {
|
|
367 $species = $self->_read_GenBank_Species(\$buffer);
|
|
368 $builder->add_slot_value(-species => $species);
|
|
369 } else {
|
|
370 while(defined($buffer = $self->_readline())) {
|
|
371 last if substr($buffer,0,1) ne ' ';
|
|
372 }
|
|
373 }
|
|
374 next;
|
|
375 }
|
|
376 #References
|
|
377 elsif (/^REFERENCE/) {
|
|
378 if($annotation) {
|
|
379 my @refs = $self->_read_GenBank_References(\$buffer);
|
|
380 foreach my $ref ( @refs ) {
|
|
381 $annotation->add_Annotation('reference',$ref);
|
|
382 }
|
|
383 } else {
|
|
384 while(defined($buffer = $self->_readline())) {
|
|
385 last if substr($buffer,0,1) ne ' ';
|
|
386 }
|
|
387 }
|
|
388 next;
|
|
389 }
|
|
390 #Comments
|
|
391 elsif (/^COMMENT\s+(.*)/) {
|
|
392 if($annotation) {
|
|
393 my $comment = $1;
|
|
394 while (defined($_ = $self->_readline)) {
|
|
395 last if (/^\S/);
|
|
396 $comment .= $_;
|
|
397 }
|
|
398 $comment =~ s/\n/ /g;
|
|
399 $comment =~ s/ +/ /g;
|
|
400 $annotation->add_Annotation(
|
|
401 'comment',
|
|
402 Bio::Annotation::Comment->new(-text => $comment));
|
|
403 $buffer = $_;
|
|
404 } else {
|
|
405 while(defined($buffer = $self->_readline())) {
|
|
406 last if substr($buffer,0,1) ne ' ';
|
|
407 }
|
|
408 }
|
|
409 next;
|
|
410 } elsif( /^SEGMENT\s+(.+)/ ) {
|
|
411 if($annotation) {
|
|
412 my $segment = $1;
|
|
413 while (defined($_ = $self->_readline)) {
|
|
414 last if (/^\S/);
|
|
415 $segment .= $_;
|
|
416 }
|
|
417 $segment =~ s/\n/ /g;
|
|
418 $segment =~ s/ +/ /g;
|
|
419 $annotation->add_Annotation(
|
|
420 'segment',
|
|
421 Bio::Annotation::SimpleValue->new(-value => $segment));
|
|
422 $buffer = $_;
|
|
423 } else {
|
|
424 while(defined($buffer = $self->_readline())) {
|
|
425 last if substr($buffer,0,1) ne ' ';
|
|
426 }
|
|
427 }
|
|
428 next;
|
|
429 }
|
|
430
|
|
431 # Exit at start of Feature table, or start of sequence
|
|
432 last if( /^(FEATURES|ORIGIN)/ );
|
|
433 # Get next line and loop again
|
|
434 $buffer = $self->_readline;
|
|
435 }
|
|
436 return undef if(! defined($buffer));
|
|
437
|
|
438 # add them all at once for efficiency
|
|
439 $builder->add_slot_value(-accession_number => shift(@acc),
|
|
440 -secondary_accessions => \@acc,
|
|
441 %params);
|
|
442 $builder->add_slot_value(-annotation => $annotation) if $annotation;
|
|
443 %params = (); # reset before possible re-use to avoid setting twice
|
|
444
|
|
445 # start over if we don't want to continue with this entry
|
|
446 if(! $builder->want_object()) {
|
|
447 $builder->make_object();
|
|
448 next RECORDSTART;
|
|
449 }
|
|
450
|
|
451 # some "minimal" formats may not necessarily have a feature table
|
|
452 if($builder->want_slot('features') && defined($_) && /^FEATURES/) {
|
|
453 # need to read the first line of the feature table
|
|
454 $buffer = $self->_readline;
|
|
455
|
|
456 # DO NOT read lines in the while condition -- this is done as a side
|
|
457 # effect in _read_FTHelper_GenBank!
|
|
458 while( defined($buffer) ) {
|
|
459 # check immediately -- not at the end of the loop
|
|
460 # note: GenPept entries obviously do not have a BASE line
|
|
461 last if(($buffer =~ /^BASE/) || ($buffer =~ /^ORIGIN/) ||
|
|
462 ($buffer =~ /^CONTIG/) );
|
|
463
|
|
464 # slurp in one feature at a time -- at return, the start of
|
|
465 # the next feature will have been read already, so we need
|
|
466 # to pass a reference, and the called method must set this
|
|
467 # to the last line read before returning
|
|
468
|
|
469 my $ftunit = $self->_read_FTHelper_GenBank(\$buffer);
|
|
470
|
|
471 # fix suggested by James Diggans
|
|
472
|
|
473 if( !defined $ftunit ) {
|
|
474 # GRRRR. We have fallen over. Try to recover
|
|
475 $self->warn("Unexpected error in feature table for ".$params{'-display_id'}." Skipping feature, attempting to recover");
|
|
476 unless( ($buffer =~ /^\s{5,5}\S+/) or ($buffer =~ /^\S+/)) {
|
|
477 $buffer = $self->_readline();
|
|
478 }
|
|
479 next; # back to reading FTHelpers
|
|
480 }
|
|
481
|
|
482 # process ftunit
|
|
483 my $feat =
|
|
484 $ftunit->_generic_seqfeature($self->location_factory(),
|
|
485 $display_id);
|
|
486 # add taxon_id from source if available
|
|
487 if($species && ($feat->primary_tag eq 'source') &&
|
|
488 $feat->has_tag('db_xref') && (! $species->ncbi_taxid())) {
|
|
489 foreach my $tagval ($feat->get_tag_values('db_xref')) {
|
|
490 if(index($tagval,"taxon:") == 0) {
|
|
491 $species->ncbi_taxid(substr($tagval,6));
|
|
492 }
|
|
493 }
|
|
494 }
|
|
495 # add feature to list of features
|
|
496 push(@features, $feat);
|
|
497 }
|
|
498 $builder->add_slot_value(-features => \@features);
|
|
499 $_ = $buffer;
|
|
500 }
|
|
501 if( defined ($_) ) {
|
|
502 if( /^CONTIG/ && $builder->want_slot('features')) {
|
|
503 $b = " $_"; # need 5 spaces to treat it like a feature
|
|
504 my $ftunit = $self->_read_FTHelper_GenBank(\$b);
|
|
505 if( ! defined $ftunit ) {
|
|
506 $self->warn("unable to parse the CONTIG feature\n");
|
|
507 } else {
|
|
508 push(@features,
|
|
509 $ftunit->_generic_seqfeature($self->location_factory(),
|
|
510 $display_id));
|
|
511 }
|
|
512 } elsif(! /^(ORIGIN|\/\/)/ ) { # advance to the sequence, if any
|
|
513 while (defined( $_ = $self->_readline) ) {
|
|
514 last if /^(ORIGIN|\/\/)/;
|
|
515 }
|
|
516 }
|
|
517 }
|
|
518 if(! $builder->want_object()) {
|
|
519 $builder->make_object(); # implicit end-of-object
|
|
520 next RECORDSTART;
|
|
521 }
|
|
522 if($builder->want_slot('seq')) {
|
|
523 # the fact that we want a sequence does not necessarily mean that
|
|
524 # there also is a sequence ...
|
|
525 if(defined($_) && s/^ORIGIN//) {
|
|
526 chomp;
|
|
527 if( $annotation && length($_) > 0 ) {
|
|
528 $annotation->add_Annotation('origin',
|
|
529 Bio::Annotation::SimpleValue->new(-value => $_));
|
|
530 }
|
|
531 my $seqc = '';
|
|
532 while( defined($_ = $self->_readline) ) {
|
|
533 /^\/\// && last;
|
|
534 $_ = uc($_);
|
|
535 s/[^A-Za-z]//g;
|
|
536 $seqc .= $_;
|
|
537 }
|
|
538 $self->debug("sequence length is ". length($seqc) ."\n");
|
|
539 $builder->add_slot_value(-seq => $seqc);
|
|
540 }
|
|
541 } elsif ( defined($_) && (substr($_,0,2) ne '//')) {
|
|
542 # advance to the end of the record
|
|
543 while( defined($_ = $self->_readline) ) {
|
|
544 last if substr($_,0,2) eq '//';
|
|
545 }
|
|
546 }
|
|
547 # Unlikely, but maybe the sequence is so weird that we don't want it
|
|
548 # anymore. We don't want to return undef if the stream's not exhausted
|
|
549 # yet.
|
|
550 $seq = $builder->make_object();
|
|
551 next RECORDSTART unless $seq;
|
|
552 last RECORDSTART;
|
|
553 } # end while RECORDSTART
|
|
554
|
|
555 return $seq;
|
|
556 }
|
|
557
|
|
558 =head2 write_seq
|
|
559
|
|
560 Title : write_seq
|
|
561 Usage : $stream->write_seq($seq)
|
|
562 Function: writes the $seq object (must be seq) to the stream
|
|
563 Returns : 1 for success and 0 for error
|
|
564 Args : array of 1 to n Bio::SeqI objects
|
|
565
|
|
566
|
|
567 =cut
|
|
568
|
|
569 sub write_seq {
|
|
570 my ($self,@seqs) = @_;
|
|
571
|
|
572 foreach my $seq ( @seqs ) {
|
|
573 $self->throw("Attempting to write with no seq!") unless defined $seq;
|
|
574
|
|
575 if( ! ref $seq || ! $seq->isa('Bio::SeqI') ) {
|
|
576 $self->warn(" $seq is not a SeqI compliant module. Attempting to dump, but may fail!");
|
|
577 }
|
|
578
|
|
579 my $str = $seq->seq;
|
|
580
|
|
581 my ($div, $mol);
|
|
582 my $len = $seq->length();
|
|
583
|
|
584 if ( $seq->can('division') ) {
|
|
585 $div=$seq->division;
|
|
586 }
|
|
587 if( !defined $div || ! $div ) { $div = 'UNK'; }
|
|
588 my $alpha = $seq->alphabet;
|
|
589 if( !$seq->can('molecule') || ! defined ($mol = $seq->molecule()) ) {
|
|
590 $mol = $alpha || 'DNA';
|
|
591 }
|
|
592
|
|
593 my $circular = 'linear ';
|
|
594 $circular = 'circular' if $seq->is_circular;
|
|
595
|
|
596 local($^W) = 0; # supressing warnings about uninitialized fields.
|
|
597
|
|
598 my $temp_line;
|
|
599 if( $self->_id_generation_func ) {
|
|
600 $temp_line = &{$self->_id_generation_func}($seq);
|
|
601 } else {
|
|
602 my $date = '';
|
|
603 if( $seq->can('get_dates') ) {
|
|
604 ($date) = $seq->get_dates();
|
|
605 }
|
|
606 $temp_line = sprintf ("%-12s%-15s%13s %s%4s%-8s%-8s %3s %-s",
|
|
607 'LOCUS', $seq->id(),$len,
|
|
608 (lc($alpha) eq 'protein') ? ('aa','', '') :
|
|
609 ('bp', '',$mol),$circular,
|
|
610 $div,$date);
|
|
611 }
|
|
612
|
|
613 $self->_print("$temp_line\n");
|
|
614 $self->_write_line_GenBank_regex("DEFINITION ", " ",
|
|
615 $seq->desc(),"\\s\+\|\$",80);
|
|
616
|
|
617 # if there, write the accession line
|
|
618
|
|
619 if( $self->_ac_generation_func ) {
|
|
620 $temp_line = &{$self->_ac_generation_func}($seq);
|
|
621 $self->_print("ACCESSION $temp_line\n");
|
|
622 } else {
|
|
623 my @acc = ();
|
|
624 push(@acc, $seq->accession_number());
|
|
625 if( $seq->isa('Bio::Seq::RichSeqI') ) {
|
|
626 push(@acc, $seq->get_secondary_accessions());
|
|
627 }
|
|
628 $self->_print("ACCESSION ", join(" ", @acc), "\n");
|
|
629 # otherwise - cannot print <sigh>
|
|
630 }
|
|
631
|
|
632 # if PID defined, print it
|
|
633 if($seq->isa('Bio::Seq::RichSeqI') && $seq->pid()) {
|
|
634 $self->_print("PID ", $seq->pid(), "\n");
|
|
635 }
|
|
636
|
|
637 # if there, write the version line
|
|
638
|
|
639 if( defined $self->_sv_generation_func() ) {
|
|
640 $temp_line = &{$self->_sv_generation_func}($seq);
|
|
641 if( $temp_line ) {
|
|
642 $self->_print("VERSION $temp_line\n");
|
|
643 }
|
|
644 } else {
|
|
645 if($seq->isa('Bio::Seq::RichSeqI') && defined($seq->seq_version)) {
|
|
646 my $id = $seq->primary_id(); # this may be a GI number
|
|
647 $self->_print("VERSION ",
|
|
648 $seq->accession_number(), ".", $seq->seq_version,
|
|
649 ($id && ($id =~ /^\d+$/) ? " GI:".$id : ""),
|
|
650 "\n");
|
|
651 }
|
|
652 }
|
|
653
|
|
654 # if there, write the keywords line
|
|
655
|
|
656 if( defined $self->_kw_generation_func() ) {
|
|
657 $temp_line = &{$self->_kw_generation_func}($seq);
|
|
658 $self->_print("KEYWORDS $temp_line\n");
|
|
659 } else {
|
|
660 if( $seq->can('keywords') ) {
|
|
661 my $kw = $seq->keywords;
|
|
662 if( ref($kw) =~ /ARRAY/i ) {
|
|
663 $kw = join("; ", @$kw);
|
|
664 }
|
|
665 $kw .= '.' if( $kw !~ /\.$/ );
|
|
666 $self->_print("KEYWORDS $kw\n");
|
|
667 }
|
|
668 }
|
|
669
|
|
670 # SEGMENT if it exists
|
|
671 foreach my $ref ( $seq->annotation->get_Annotations('segment') ) {
|
|
672 $self->_print(sprintf ("%-11s %s\n",'SEGMENT',
|
|
673 $ref->value));
|
|
674 }
|
|
675
|
|
676 # Organism lines
|
|
677 if (my $spec = $seq->species) {
|
|
678 my ($species, $genus, @class) = $spec->classification();
|
|
679 my $OS;
|
|
680 if( $spec->common_name ) {
|
|
681 $OS = $spec->common_name;
|
|
682 } else {
|
|
683 $OS = "$genus $species";
|
|
684 }
|
|
685 if (my $ssp = $spec->sub_species) {
|
|
686 $OS .= " $ssp";
|
|
687 }
|
|
688 $self->_print("SOURCE $OS\n");
|
|
689 $self->_print(" ORGANISM ",
|
|
690 ($spec->organelle() ? $spec->organelle()." " : ""),
|
|
691 "$genus $species", "\n");
|
|
692 my $OC = join('; ', (reverse(@class), $genus)) .'.';
|
|
693 $self->_write_line_GenBank_regex(' 'x12,' 'x12,
|
|
694 $OC,"\\s\+\|\$",80);
|
|
695 }
|
|
696
|
|
697 # Reference lines
|
|
698 my $count = 1;
|
|
699 foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
|
|
700 $temp_line = sprintf ("REFERENCE $count (%s %d to %d)",
|
|
701 ($seq->alphabet() eq "protein" ?
|
|
702 "residues" : "bases"),
|
|
703 $ref->start,$ref->end);
|
|
704 $self->_print("$temp_line\n");
|
|
705 $self->_write_line_GenBank_regex(" AUTHORS ",' 'x12,
|
|
706 $ref->authors,"\\s\+\|\$",80);
|
|
707 $self->_write_line_GenBank_regex(" TITLE "," "x12,
|
|
708 $ref->title,"\\s\+\|\$",80);
|
|
709 $self->_write_line_GenBank_regex(" JOURNAL "," "x12,
|
|
710 $ref->location,"\\s\+\|\$",80);
|
|
711 if ($ref->comment) {
|
|
712 $self->_write_line_GenBank_regex(" REMARK "," "x12,
|
|
713 $ref->comment,"\\s\+\|\$",80);
|
|
714 }
|
|
715 if( $ref->medline) {
|
|
716 $self->_write_line_GenBank_regex(" MEDLINE "," "x12,
|
|
717 $ref->medline, "\\s\+\|\$",80);
|
|
718 # I am assuming that pubmed entries only exist when there
|
|
719 # are also MEDLINE entries due to the indentation
|
|
720 # This could be a wrong assumption
|
|
721 if( $ref->pubmed ) {
|
|
722 $self->_write_line_GenBank_regex(" PUBMED "," "x12,
|
|
723 $ref->pubmed, "\\s\+\|\$",
|
|
724 80);
|
|
725 }
|
|
726 }
|
|
727 $count++;
|
|
728 }
|
|
729 # Comment lines
|
|
730
|
|
731 foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
|
|
732 $self->_write_line_GenBank_regex("COMMENT "," "x12,
|
|
733 $comment->text,"\\s\+\|\$",80);
|
|
734 }
|
|
735 $self->_print("FEATURES Location/Qualifiers\n");
|
|
736
|
|
737 my $contig;
|
|
738 if( defined $self->_post_sort ) {
|
|
739 # we need to read things into an array. Process. Sort them. Print 'em
|
|
740
|
|
741 my $post_sort_func = $self->_post_sort();
|
|
742 my @fth;
|
|
743
|
|
744 foreach my $sf ( $seq->top_SeqFeatures ) {
|
|
745 push(@fth,Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq));
|
|
746 }
|
|
747
|
|
748 @fth = sort { &$post_sort_func($a,$b) } @fth;
|
|
749
|
|
750 foreach my $fth ( @fth ) {
|
|
751 $self->_print_GenBank_FTHelper($fth);
|
|
752 }
|
|
753 } else {
|
|
754 # not post sorted. And so we can print as we get them.
|
|
755 # lower memory load...
|
|
756
|
|
757 foreach my $sf ( $seq->top_SeqFeatures ) {
|
|
758 my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq);
|
|
759 foreach my $fth ( @fth ) {
|
|
760 if( ! $fth->isa('Bio::SeqIO::FTHelper') ) {
|
|
761 $sf->throw("Cannot process FTHelper... $fth");
|
|
762 }
|
|
763 $self->_print_GenBank_FTHelper($fth);
|
|
764 }
|
|
765 }
|
|
766 }
|
|
767 if( $seq->length == 0 ) { $self->_show_dna(0) }
|
|
768
|
|
769 if( $self->_show_dna() == 0 ) {
|
|
770 $self->_print("\n//\n");
|
|
771 return;
|
|
772 }
|
|
773
|
|
774 # finished printing features.
|
|
775
|
|
776 $str =~ tr/A-Z/a-z/;
|
|
777
|
|
778 # Count each nucleotide
|
|
779 unless( $mol eq 'protein' ) {
|
|
780 my $alen = $str =~ tr/a/a/;
|
|
781 my $clen = $str =~ tr/c/c/;
|
|
782 my $glen = $str =~ tr/g/g/;
|
|
783 my $tlen = $str =~ tr/t/t/;
|
|
784
|
|
785 my $olen = $len - ($alen + $tlen + $clen + $glen);
|
|
786 if( $olen < 0 ) {
|
|
787 $self->warn("Weird. More atgc than bases. Problem!");
|
|
788 }
|
|
789
|
|
790 my $base_count = sprintf("BASE COUNT %8s a %6s c %6s g %6s t%s\n",
|
|
791 $alen,$clen,$glen,$tlen,
|
|
792 ( $olen > 0 ) ? sprintf("%6s others",$olen) : '');
|
|
793 $self->_print($base_count);
|
|
794 }
|
|
795 my ($o) = $seq->annotation->get_Annotations('origin');
|
|
796 $self->_print(sprintf("%-6s%s\n",'ORIGIN',$o ? $o->value : ''));
|
|
797
|
|
798 # print out the sequence
|
|
799 my $nuc = 60; # Number of nucleotides per line
|
|
800 my $whole_pat = 'a10' x 6; # Pattern for unpacking a whole line
|
|
801 my $out_pat = 'A11' x 6; # Pattern for packing a line
|
|
802 my $length = length($str);
|
|
803
|
|
804 # Calculate the number of nucleotides which fit on whole lines
|
|
805 my $whole = int($length / $nuc) * $nuc;
|
|
806
|
|
807 # Print the whole lines
|
|
808 my $i;
|
|
809 for ($i = 0; $i < $whole; $i += $nuc) {
|
|
810 my $blocks = pack $out_pat,
|
|
811 unpack $whole_pat,
|
|
812 substr($str, $i, $nuc);
|
|
813 chop $blocks;
|
|
814 $self->_print(sprintf("%9d $blocks\n", $i + $nuc - 59));
|
|
815 }
|
|
816
|
|
817 # Print the last line
|
|
818 if (my $last = substr($str, $i)) {
|
|
819 my $last_len = length($last);
|
|
820 my $last_pat = 'a10' x int($last_len / 10) .'a'. $last_len % 10;
|
|
821 my $blocks = pack $out_pat,
|
|
822 unpack($last_pat, $last);
|
|
823 $blocks =~ s/ +$//;
|
|
824 $self->_print(sprintf("%9d $blocks\n", $length - $last_len + 1));
|
|
825 }
|
|
826
|
|
827 $self->_print("//\n");
|
|
828
|
|
829 $self->flush if $self->_flush_on_write && defined $self->_fh;
|
|
830 return 1;
|
|
831 }
|
|
832 }
|
|
833
|
|
834 =head2 _print_GenBank_FTHelper
|
|
835
|
|
836 Title : _print_GenBank_FTHelper
|
|
837 Usage :
|
|
838 Function:
|
|
839 Example :
|
|
840 Returns :
|
|
841 Args :
|
|
842
|
|
843
|
|
844 =cut
|
|
845
|
|
846 sub _print_GenBank_FTHelper {
|
|
847 my ($self,$fth,$always_quote) = @_;
|
|
848
|
|
849 if( ! ref $fth || ! $fth->isa('Bio::SeqIO::FTHelper') ) {
|
|
850 $fth->warn("$fth is not a FTHelper class. Attempting to print, but there could be tears!");
|
|
851 }
|
|
852 if( defined $fth->key &&
|
|
853 $fth->key eq 'CONTIG' ) {
|
|
854 $self->_write_line_GenBank_regex(sprintf("%-12s",$fth->key),
|
|
855 ' 'x12,$fth->loc,"\,\|\$",80);
|
|
856 } else {
|
|
857 $self->_write_line_GenBank_regex(sprintf(" %-16s",$fth->key),
|
|
858 " "x21,
|
|
859 $fth->loc,"\,\|\$",80);
|
|
860 }
|
|
861
|
|
862 if( !defined $always_quote) { $always_quote = 0; }
|
|
863
|
|
864 foreach my $tag ( keys %{$fth->field} ) {
|
|
865 foreach my $value ( @{$fth->field->{$tag}} ) {
|
|
866 $value =~ s/\"/\"\"/g;
|
|
867 if ($value eq "_no_value") {
|
|
868 $self->_write_line_GenBank_regex(" "x21,
|
|
869 " "x21,
|
|
870 "/$tag","\.\|\$",80);
|
|
871 }
|
|
872 elsif( $always_quote == 1 || $value !~ /^\d+$/ ) {
|
|
873 my ($pat) = ($value =~ /\s/ ? '\s|$' : '.|$');
|
|
874 $self->_write_line_GenBank_regex(" "x21,
|
|
875 " "x21,
|
|
876 "/$tag=\"$value\"",$pat,80);
|
|
877 } else {
|
|
878 $self->_write_line_GenBank_regex(" "x21,
|
|
879 " "x21,
|
|
880 "/$tag=$value","\.\|\$",80);
|
|
881 }
|
|
882 }
|
|
883 }
|
|
884
|
|
885 }
|
|
886
|
|
887
|
|
888 =head2 _read_GenBank_References
|
|
889
|
|
890 Title : _read_GenBank_References
|
|
891 Usage :
|
|
892 Function: Reads references from GenBank format. Internal function really
|
|
893 Returns :
|
|
894 Args :
|
|
895
|
|
896
|
|
897 =cut
|
|
898
|
|
899 sub _read_GenBank_References{
|
|
900 my ($self,$buffer) = @_;
|
|
901 my (@refs);
|
|
902 my $ref;
|
|
903
|
|
904 # assumme things are starting with RN
|
|
905
|
|
906 if( $$buffer !~ /^REFERENCE/ ) {
|
|
907 warn("Not parsing line '$$buffer' which maybe important");
|
|
908 }
|
|
909
|
|
910 $_ = $$buffer;
|
|
911
|
|
912 my (@title,@loc,@authors,@com,@medline,@pubmed);
|
|
913
|
|
914 REFLOOP: while( defined($_) || defined($_ = $self->_readline) ) {
|
|
915 if (/^ AUTHORS\s+(.*)/) {
|
|
916 push (@authors, $1);
|
|
917 while ( defined($_ = $self->_readline) ) {
|
|
918 /^\s{3,}(.*)/ && do { push (@authors, $1);next;};
|
|
919 last;
|
|
920 }
|
|
921 $ref->authors(join(' ', @authors));
|
|
922 }
|
|
923 if (/^ TITLE\s+(.*)/) {
|
|
924 push (@title, $1);
|
|
925 while ( defined($_ = $self->_readline) ) {
|
|
926 /^\s{3,}(.*)/ && do { push (@title, $1);
|
|
927 next;
|
|
928 };
|
|
929 last;
|
|
930 }
|
|
931 $ref->title(join(' ', @title));
|
|
932 }
|
|
933 if (/^ JOURNAL\s+(.*)/) {
|
|
934 push(@loc, $1);
|
|
935 while ( defined($_ = $self->_readline) ) {
|
|
936 /^\s{3,}(.*)/ && do { push(@loc, $1);
|
|
937 next;
|
|
938 };
|
|
939 last;
|
|
940 }
|
|
941 $ref->location(join(' ', @loc));
|
|
942 redo REFLOOP;
|
|
943 }
|
|
944 if (/^ REMARK\s+(.*)/) {
|
|
945 push (@com, $1);
|
|
946 while ( defined($_ = $self->_readline) ) {
|
|
947 /^\s{3,}(.*)/ && do { push(@com, $1);
|
|
948 next;
|
|
949 };
|
|
950 last;
|
|
951 }
|
|
952 $ref->comment(join(' ', @com));
|
|
953 redo REFLOOP;
|
|
954 }
|
|
955 if( /^ MEDLINE\s+(.*)/ ) {
|
|
956 push(@medline,$1);
|
|
957 while ( defined($_ = $self->_readline) ) {
|
|
958 /^\s{4,}(.*)/ && do { push(@medline, $1);
|
|
959 next;
|
|
960 };
|
|
961 last;
|
|
962 }
|
|
963 $ref->medline(join(' ', @medline));
|
|
964 redo REFLOOP;
|
|
965 }
|
|
966 if( /^ PUBMED\s+(.*)/ ) {
|
|
967 push(@pubmed,$1);
|
|
968 while ( defined($_ = $self->_readline) ) {
|
|
969 /^\s{5,}(.*)/ && do { push(@pubmed, $1);
|
|
970 next;
|
|
971 };
|
|
972 last;
|
|
973 }
|
|
974 $ref->pubmed(join(' ', @pubmed));
|
|
975 redo REFLOOP;
|
|
976 }
|
|
977
|
|
978 /^REFERENCE/ && do {
|
|
979 # store current reference
|
|
980 $self->_add_ref_to_array(\@refs,$ref) if $ref;
|
|
981 # reset
|
|
982 @authors = ();
|
|
983 @title = ();
|
|
984 @loc = ();
|
|
985 @com = ();
|
|
986 @pubmed = ();
|
|
987 @medline = ();
|
|
988 # create the new reference object
|
|
989 $ref = Bio::Annotation::Reference->new();
|
|
990 # check whether start and end base is given
|
|
991 if (/^REFERENCE\s+\d+\s+\([a-z]+ (\d+) to (\d+)/){
|
|
992 $ref->start($1);
|
|
993 $ref->end($2);
|
|
994 }
|
|
995 };
|
|
996
|
|
997 /^(FEATURES)|(COMMENT)/ && last;
|
|
998
|
|
999 $_ = undef; # Empty $_ to trigger read of next line
|
|
1000 }
|
|
1001
|
|
1002 # store last reference
|
|
1003 $self->_add_ref_to_array(\@refs,$ref) if $ref;
|
|
1004
|
|
1005 $$buffer = $_;
|
|
1006
|
|
1007 #print "\nnumber of references found: ", $#refs+1,"\n";
|
|
1008
|
|
1009 return @refs;
|
|
1010 }
|
|
1011
|
|
1012 #
|
|
1013 # This is undocumented as it shouldn't be called by anywhere else as
|
|
1014 # read_GenBank_References. For those who still want to know:
|
|
1015 #
|
|
1016 # Purpose: adds a Reference object to an array of Reference objects, takes
|
|
1017 # care of possible cleanups to be done (currently, only author and title
|
|
1018 # will be chopped of trailing semicolons).
|
|
1019 # Parameters:
|
|
1020 # a reference to an array of Reference objects
|
|
1021 # the Reference object to be added
|
|
1022 # Returns: nothing
|
|
1023 #
|
|
1024 sub _add_ref_to_array {
|
|
1025 my ($self, $refs, $ref) = @_;
|
|
1026
|
|
1027 # first, polish author and title by removing possible trailing semicolons
|
|
1028 my $au = $ref->authors();
|
|
1029 my $title = $ref->title();
|
|
1030 $au =~ s/;\s*$//g if $au;
|
|
1031 $title =~ s/;\s*$//g if $title;
|
|
1032 $ref->authors($au);
|
|
1033 $ref->title($title);
|
|
1034 # the rest should be clean already, so go ahead and add it
|
|
1035 push(@{$refs}, $ref);
|
|
1036 }
|
|
1037
|
|
1038 =head2 _read_GenBank_Species
|
|
1039
|
|
1040 Title : _read_GenBank_Species
|
|
1041 Usage :
|
|
1042 Function: Reads the GenBank Organism species and classification
|
|
1043 lines.
|
|
1044 Example :
|
|
1045 Returns : A Bio::Species object
|
|
1046 Args : a reference to the current line buffer
|
|
1047
|
|
1048 =cut
|
|
1049
|
|
1050 sub _read_GenBank_Species {
|
|
1051 my( $self,$buffer) = @_;
|
|
1052 my @organell_names = ("chloroplast", "mitochondr");
|
|
1053 # only those carrying DNA, apart from the nucleus
|
|
1054
|
|
1055 $_ = $$buffer;
|
|
1056
|
|
1057 my( $sub_species, $species, $genus, $common, $organelle, @class );
|
|
1058 # upon first entering the loop, we must not read a new line -- the SOURCE
|
|
1059 # line is already in the buffer (HL 05/10/2000)
|
|
1060 while (defined($_) || defined($_ = $self->_readline())) {
|
|
1061 # de-HTMLify (links that may be encountered here don't contain
|
|
1062 # escaped '>', so a simple-minded approach suffices)
|
|
1063 s/<[^>]+>//g;
|
|
1064 if (/^SOURCE\s+(.*)/) {
|
|
1065 # FIXME this is probably mostly wrong (e.g., it yields things like
|
|
1066 # Homo sapiens adult placenta cDNA to mRNA
|
|
1067 # which is certainly not what you want)
|
|
1068 $common = $1;
|
|
1069 $common =~ s/\.$//; # remove trailing dot
|
|
1070 } elsif (/^\s+ORGANISM/) {
|
|
1071 my @spflds = split(' ', $_);
|
|
1072 shift(@spflds); # ORGANISM
|
|
1073 if(grep { $_ =~ /^$spflds[0]/i; } @organell_names) {
|
|
1074 $organelle = shift(@spflds);
|
|
1075 }
|
|
1076 $genus = shift(@spflds);
|
|
1077 if(@spflds) {
|
|
1078 $species = shift(@spflds);
|
|
1079 } else {
|
|
1080 $species = "sp.";
|
|
1081 }
|
|
1082 $sub_species = shift(@spflds) if(@spflds);
|
|
1083 } elsif (/^\s+(.+)/) {
|
|
1084 # only split on ';' or '.' so that
|
|
1085 # classification that is 2 words will
|
|
1086 # still get matched
|
|
1087 # use map to remove trailing/leading spaces
|
|
1088 push(@class, map { s/^\s+//; s/\s+$//; $_; } split /[;\.]+/, $1);
|
|
1089 } else {
|
|
1090 last;
|
|
1091 }
|
|
1092
|
|
1093 $_ = undef; # Empty $_ to trigger read of next line
|
|
1094 }
|
|
1095
|
|
1096 $$buffer = $_;
|
|
1097
|
|
1098 # Don't make a species object if it's empty or "Unknown" or "None"
|
|
1099 return unless $genus and $genus !~ /^(Unknown|None)$/i;
|
|
1100
|
|
1101 # Bio::Species array needs array in Species -> Kingdom direction
|
|
1102 if ($class[$#class] eq $genus) {
|
|
1103 push( @class, $species );
|
|
1104 } else {
|
|
1105 push( @class, $genus, $species );
|
|
1106 }
|
|
1107 @class = reverse @class;
|
|
1108
|
|
1109 my $make = Bio::Species->new();
|
|
1110 $make->classification( \@class, "FORCE" ); # no name validation please
|
|
1111 $make->common_name( $common ) if $common;
|
|
1112 $make->sub_species( $sub_species ) if $sub_species;
|
|
1113 $make->organelle($organelle) if $organelle;
|
|
1114 return $make;
|
|
1115 }
|
|
1116
|
|
1117 =head2 _read_FTHelper_GenBank
|
|
1118
|
|
1119 Title : _read_FTHelper_GenBank
|
|
1120 Usage : _read_FTHelper_GenBank($buffer)
|
|
1121 Function: reads the next FT key line
|
|
1122 Example :
|
|
1123 Returns : Bio::SeqIO::FTHelper object
|
|
1124 Args : filehandle and reference to a scalar
|
|
1125
|
|
1126
|
|
1127 =cut
|
|
1128
|
|
1129 sub _read_FTHelper_GenBank {
|
|
1130 my ($self,$buffer) = @_;
|
|
1131
|
|
1132 my ($key, # The key of the feature
|
|
1133 $loc # The location line from the feature
|
|
1134 );
|
|
1135 my @qual = (); # An arrray of lines making up the qualifiers
|
|
1136
|
|
1137 if ($$buffer =~ /^ (\S+)\s+(.+?)\s*$/o) {
|
|
1138 $key = $1;
|
|
1139 $loc = $2;
|
|
1140 # Read all the lines up to the next feature
|
|
1141 while ( defined($_ = $self->_readline) ) {
|
|
1142 if (/^(\s+)(.+?)\s*$/o) {
|
|
1143 # Lines inside features are preceded by 21 spaces
|
|
1144 # A new feature is preceded by 5 spaces
|
|
1145 if (length($1) > 6) {
|
|
1146 # Add to qualifiers if we're in the qualifiers, or if it's
|
|
1147 # the first qualifier
|
|
1148 if (@qual || (index($2,'/') == 0)) {
|
|
1149 push(@qual, $2);
|
|
1150 }
|
|
1151 # We're still in the location line, so append to location
|
|
1152 else {
|
|
1153 $loc .= $2;
|
|
1154 }
|
|
1155 } else {
|
|
1156 # We've reached the start of the next feature
|
|
1157 last;
|
|
1158 }
|
|
1159 } else {
|
|
1160 # We're at the end of the feature table
|
|
1161 last;
|
|
1162 }
|
|
1163 }
|
|
1164 } else {
|
|
1165 # No feature key
|
|
1166 $self->debug("no feature key!\n");
|
|
1167 # change suggested by JDiggans to avoid infinite loop-
|
|
1168 # see bugreport 1062.
|
|
1169 # reset buffer to prevent infinite loop
|
|
1170 $$buffer = $self->_readline();
|
|
1171 return;
|
|
1172 }
|
|
1173
|
|
1174 # Put the first line of the next feature into the buffer
|
|
1175 $$buffer = $_;
|
|
1176
|
|
1177 # Make the new FTHelper object
|
|
1178 my $out = new Bio::SeqIO::FTHelper();
|
|
1179 $out->verbose($self->verbose());
|
|
1180 $out->key($key);
|
|
1181 $out->loc($loc);
|
|
1182
|
|
1183 # Now parse and add any qualifiers. (@qual is kept
|
|
1184 # intact to provide informative error messages.)
|
|
1185 QUAL: for (my $i = 0; $i < @qual; $i++) {
|
|
1186 $_ = $qual[$i];
|
|
1187 my( $qualifier, $value ) = (m{^/([^=]+)(?:=(.+))?})
|
|
1188 or $self->warn("cannot see new qualifier in feature $key: ".
|
|
1189 $qual[$i]);
|
|
1190 #or $self->throw("Can't see new qualifier in: $_\nfrom:\n"
|
|
1191 # . join('', map "$_\n", @qual));
|
|
1192 $qualifier = '' unless( defined $qualifier);
|
|
1193 if (defined $value) {
|
|
1194 # Do we have a quoted value?
|
|
1195 if (substr($value, 0, 1) eq '"') {
|
|
1196 # Keep adding to value until we find the trailing quote
|
|
1197 # and the quotes are balanced
|
|
1198 while ($value !~ /\"$/ or $value =~ tr/"/"/ % 2) {
|
|
1199 if($i >= $#qual) {
|
|
1200 $self->warn("Unbalanced quote in:\n" .
|
|
1201 join('', map("$_\n", @qual)) .
|
|
1202 "No further qualifiers will " .
|
|
1203 "be added for this feature");
|
|
1204 last QUAL;
|
|
1205 }
|
|
1206 $i++; # modifying a for-loop variable inside of the loop
|
|
1207 # is not the best programming style ...
|
|
1208 my $next = $qual[$i];
|
|
1209
|
|
1210 # add to value with a space unless the value appears
|
|
1211 # to be a sequence (translation for example)
|
|
1212 if(($value.$next) =~ /[^A-Za-z"-]/) {
|
|
1213 $value .= " ";
|
|
1214 }
|
|
1215 $value .= $next;
|
|
1216 }
|
|
1217 # Trim leading and trailing quotes
|
|
1218 $value =~ s/^"|"$//g;
|
|
1219 # Undouble internal quotes
|
|
1220 $value =~ s/""/\"/g;
|
|
1221 }
|
|
1222 } else {
|
|
1223 $value = '_no_value';
|
|
1224 }
|
|
1225 # Store the qualifier
|
|
1226 $out->field->{$qualifier} ||= [];
|
|
1227 push(@{$out->field->{$qualifier}},$value);
|
|
1228 }
|
|
1229 return $out;
|
|
1230 }
|
|
1231
|
|
1232 =head2 _write_line_GenBank
|
|
1233
|
|
1234 Title : _write_line_GenBank
|
|
1235 Usage :
|
|
1236 Function: internal function
|
|
1237 Example :
|
|
1238 Returns :
|
|
1239 Args :
|
|
1240
|
|
1241
|
|
1242 =cut
|
|
1243
|
|
1244 sub _write_line_GenBank{
|
|
1245 my ($self,$pre1,$pre2,$line,$length) = @_;
|
|
1246
|
|
1247 $length || $self->throw("Miscalled write_line_GenBank without length. Programming error!");
|
|
1248 my $subl = $length - length $pre2;
|
|
1249 my $linel = length $line;
|
|
1250 my $i;
|
|
1251
|
|
1252 my $sub = substr($line,0,$length - length $pre1);
|
|
1253
|
|
1254 $self->_print("$pre1$sub\n");
|
|
1255
|
|
1256 for($i= ($length - length $pre1);$i < $linel;) {
|
|
1257 $sub = substr($line,$i,($subl));
|
|
1258 $self->_print("$pre2$sub\n");
|
|
1259 $i += $subl;
|
|
1260 }
|
|
1261
|
|
1262 }
|
|
1263
|
|
1264 =head2 _write_line_GenBank_regex
|
|
1265
|
|
1266 Title : _write_line_GenBank_regex
|
|
1267 Usage :
|
|
1268 Function: internal function for writing lines of specified
|
|
1269 length, with different first and the next line
|
|
1270 left hand headers and split at specific points in the
|
|
1271 text
|
|
1272 Example :
|
|
1273 Returns : nothing
|
|
1274 Args : file handle, first header, second header, text-line, regex for line breaks, total line length
|
|
1275
|
|
1276
|
|
1277 =cut
|
|
1278
|
|
1279 sub _write_line_GenBank_regex {
|
|
1280 my ($self,$pre1,$pre2,$line,$regex,$length) = @_;
|
|
1281
|
|
1282 #print STDOUT "Going to print with $line!\n";
|
|
1283
|
|
1284 $length || $self->throw( "Miscalled write_line_GenBank without length. Programming error!");
|
|
1285
|
|
1286 # if( length $pre1 != length $pre2 ) {
|
|
1287 # $self->throw( "Programming error - cannot called write_line_GenBank_regex with different length pre1 and pre2 tags!");
|
|
1288 # }
|
|
1289
|
|
1290 my $subl = $length - (length $pre1) - 2;
|
|
1291 my @lines = ();
|
|
1292
|
|
1293 CHUNK: while($line) {
|
|
1294 foreach my $pat ($regex, '[,;\.\/-]\s|'.$regex, '[,;\.\/-]|'.$regex) {
|
|
1295 if($line =~ m/^(.{1,$subl})($pat)(.*)/) {
|
|
1296 $line = $3;
|
|
1297 # be strict about not padding spaces according to
|
|
1298 # genbank format
|
|
1299 my $l = $1.$2;
|
|
1300 $l =~ s/\s+$//;
|
|
1301 push(@lines, $l);
|
|
1302 next CHUNK;
|
|
1303 }
|
|
1304 }
|
|
1305 # if we get here none of the patterns matched $subl or less chars
|
|
1306 $self->warn("trouble dissecting \"$line\" into chunks ".
|
|
1307 "of $subl chars or less - this tag won't print right");
|
|
1308 # insert a space char to prevent infinite loops
|
|
1309 $line = substr($line,0,$subl) . " " . substr($line,$subl);
|
|
1310 }
|
|
1311
|
|
1312 my $s = shift @lines;
|
|
1313 $self->_print("$pre1$s\n");
|
|
1314 foreach my $s ( @lines ) {
|
|
1315 $self->_print("$pre2$s\n");
|
|
1316 }
|
|
1317 }
|
|
1318
|
|
1319 =head2 _post_sort
|
|
1320
|
|
1321 Title : _post_sort
|
|
1322 Usage : $obj->_post_sort($newval)
|
|
1323 Function:
|
|
1324 Returns : value of _post_sort
|
|
1325 Args : newvalue (optional)
|
|
1326
|
|
1327
|
|
1328 =cut
|
|
1329
|
|
1330 sub _post_sort{
|
|
1331 my ($obj,$value) = @_;
|
|
1332 if( defined $value) {
|
|
1333 $obj->{'_post_sort'} = $value;
|
|
1334 }
|
|
1335 return $obj->{'_post_sort'};
|
|
1336 }
|
|
1337
|
|
1338 =head2 _show_dna
|
|
1339
|
|
1340 Title : _show_dna
|
|
1341 Usage : $obj->_show_dna($newval)
|
|
1342 Function:
|
|
1343 Returns : value of _show_dna
|
|
1344 Args : newvalue (optional)
|
|
1345
|
|
1346
|
|
1347 =cut
|
|
1348
|
|
1349 sub _show_dna{
|
|
1350 my ($obj,$value) = @_;
|
|
1351 if( defined $value) {
|
|
1352 $obj->{'_show_dna'} = $value;
|
|
1353 }
|
|
1354 return $obj->{'_show_dna'};
|
|
1355 }
|
|
1356
|
|
1357 =head2 _id_generation_func
|
|
1358
|
|
1359 Title : _id_generation_func
|
|
1360 Usage : $obj->_id_generation_func($newval)
|
|
1361 Function:
|
|
1362 Returns : value of _id_generation_func
|
|
1363 Args : newvalue (optional)
|
|
1364
|
|
1365
|
|
1366 =cut
|
|
1367
|
|
1368 sub _id_generation_func{
|
|
1369 my ($obj,$value) = @_;
|
|
1370 if( defined $value ) {
|
|
1371 $obj->{'_id_generation_func'} = $value;
|
|
1372 }
|
|
1373 return $obj->{'_id_generation_func'};
|
|
1374 }
|
|
1375
|
|
1376 =head2 _ac_generation_func
|
|
1377
|
|
1378 Title : _ac_generation_func
|
|
1379 Usage : $obj->_ac_generation_func($newval)
|
|
1380 Function:
|
|
1381 Returns : value of _ac_generation_func
|
|
1382 Args : newvalue (optional)
|
|
1383
|
|
1384
|
|
1385 =cut
|
|
1386
|
|
1387 sub _ac_generation_func{
|
|
1388 my ($obj,$value) = @_;
|
|
1389 if( defined $value ) {
|
|
1390 $obj->{'_ac_generation_func'} = $value;
|
|
1391 }
|
|
1392 return $obj->{'_ac_generation_func'};
|
|
1393 }
|
|
1394
|
|
1395 =head2 _sv_generation_func
|
|
1396
|
|
1397 Title : _sv_generation_func
|
|
1398 Usage : $obj->_sv_generation_func($newval)
|
|
1399 Function:
|
|
1400 Returns : value of _sv_generation_func
|
|
1401 Args : newvalue (optional)
|
|
1402
|
|
1403
|
|
1404 =cut
|
|
1405
|
|
1406 sub _sv_generation_func{
|
|
1407 my ($obj,$value) = @_;
|
|
1408 if( defined $value ) {
|
|
1409 $obj->{'_sv_generation_func'} = $value;
|
|
1410 }
|
|
1411 return $obj->{'_sv_generation_func'};
|
|
1412
|
|
1413 }
|
|
1414
|
|
1415 =head2 _kw_generation_func
|
|
1416
|
|
1417 Title : _kw_generation_func
|
|
1418 Usage : $obj->_kw_generation_func($newval)
|
|
1419 Function:
|
|
1420 Returns : value of _kw_generation_func
|
|
1421 Args : newvalue (optional)
|
|
1422
|
|
1423
|
|
1424 =cut
|
|
1425
|
|
1426 sub _kw_generation_func{
|
|
1427 my ($obj,$value) = @_;
|
|
1428 if( defined $value ) {
|
|
1429 $obj->{'_kw_generation_func'} = $value;
|
|
1430 }
|
|
1431 return $obj->{'_kw_generation_func'};
|
|
1432 }
|
|
1433
|
|
1434 1;
|