Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/SeqIO/genbank.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: genbank.pm,v 1.76.2.12 2003/09/13 23:33:04 jason Exp $ | |
2 # | |
3 # BioPerl module for Bio::SeqIO::GenBank | |
4 # | |
5 # Cared for by Elia Stupka <elia@tll.org.sg> | |
6 # | |
7 # Copyright Elia Stupka | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 | |
11 # POD documentation - main docs before the code | |
12 | |
13 =head1 NAME | |
14 | |
15 Bio::SeqIO::GenBank - GenBank sequence input/output stream | |
16 | |
17 =head1 SYNOPSIS | |
18 | |
19 It is probably best not to use this object directly, but | |
20 rather go through the SeqIO handler system. Go: | |
21 | |
22 $stream = Bio::SeqIO->new(-file => $filename, -format => 'GenBank'); | |
23 | |
24 while ( my $seq = $stream->next_seq() ) { | |
25 # do something with $seq | |
26 } | |
27 | |
28 | |
29 =head1 DESCRIPTION | |
30 | |
31 This object can transform Bio::Seq objects to and from GenBank flat | |
32 file databases. | |
33 | |
34 There is alot of flexibility here about how to dump things which I need | |
35 to document fully. | |
36 | |
37 =head2 Mapping of record properties to object properties | |
38 | |
39 This section is supposed to document which sections and properties of | |
40 a GenBank databank record end up where in the Bioperl object model. It | |
41 is far from complete and presently focuses only on those mappings | |
42 which may be non-obvious. $seq in the text refers to the | |
43 Bio::Seq::RichSeqI implementing object returned by the parser for each | |
44 record. | |
45 | |
46 =over 4 | |
47 | |
48 =item GI number | |
49 | |
50 $seq-E<gt>primary_id | |
51 | |
52 =back | |
53 | |
54 =head2 Optional functions | |
55 | |
56 =over 3 | |
57 | |
58 =item _show_dna() | |
59 | |
60 (output only) shows the dna or not | |
61 | |
62 =item _post_sort() | |
63 | |
64 (output only) provides a sorting func which is applied to the FTHelpers | |
65 before printing | |
66 | |
67 =item _id_generation_func() | |
68 | |
69 This is function which is called as | |
70 | |
71 print "ID ", $func($seq), "\n"; | |
72 | |
73 To generate the ID line. If it is not there, it generates a sensible ID | |
74 line using a number of tools. | |
75 | |
76 | |
77 If you want to output annotations in genbank format they need to be | |
78 stored in a Bio::Annotation::Collection object which is accessible | |
79 through the Bio::SeqI interface method L<annotation()|annotation>. | |
80 | |
81 The following are the names of the keys which are polled from a | |
82 L<Bio::Annotation::Collection> object. | |
83 | |
84 reference - Should contain Bio::Annotation::Reference objects | |
85 comment - Should contain Bio::Annotation::Comment objects | |
86 | |
87 segment - Should contain a Bio::Annotation::SimpleValue object | |
88 origin - Should contain a Bio::Annotation::SimpleValue object | |
89 | |
90 =back | |
91 | |
92 =head1 Where does the data go? | |
93 | |
94 Data parsed in Bio::SeqIO::genbank is stored in a variety of data | |
95 fields in the sequence object that is returned. More information in | |
96 the HOWTOs about exactly what each field means and where it goes. | |
97 Here is a partial list of fields. | |
98 | |
99 Items listed as RichSeq or Seq or PrimarySeq and then NAME() tell you | |
100 the top level object which defines a function called NAME() which | |
101 stores this information. | |
102 | |
103 Items listed as Annotation 'NAME' tell you the data is stored the | |
104 associated Bio::Annotation::Colection object which is associated with | |
105 Bio::Seq objects. If it is explictly requested that no annotations | |
106 should be stored when parsing a record of course they won't be | |
107 available when you try and get them. If you are having this problem | |
108 look at the type of SeqBuilder that is being used to contruct your | |
109 sequence object. | |
110 | |
111 Comments Annotation 'comment' | |
112 References Annotation 'reference' | |
113 Segment Annotation 'segment' | |
114 Origin Annotation 'origin' | |
115 | |
116 Accessions PrimarySeq accession_number() | |
117 Secondary accessions RichSeq get_secondary_accessions() | |
118 Keywords RichSeq keywords() | |
119 Dates RichSeq get_dates() | |
120 Molecule RichSeq molecule() | |
121 Seq Version RichSeq seq_version() | |
122 PID RichSeq pid() | |
123 Division RichSeq division() | |
124 Features Seq get_SeqFeatures() | |
125 Alphabet PrimarySeq alphabet() | |
126 Definition PrimarySeq description() or desc() | |
127 Version PrimarySeq version() | |
128 | |
129 Sequence PrimarySeq seq() | |
130 | |
131 =head1 FEEDBACK | |
132 | |
133 =head2 Mailing Lists | |
134 | |
135 User feedback is an integral part of the evolution of this | |
136 and other Bioperl modules. Send your comments and suggestions preferably | |
137 to one of the Bioperl mailing lists. | |
138 Your participation is much appreciated. | |
139 | |
140 bioperl-l@bioperl.org - General discussion | |
141 http://www.bioperl.org/MailList.shtml - About the mailing lists | |
142 | |
143 =head2 Reporting Bugs | |
144 | |
145 Report bugs to the Bioperl bug tracking system to help us keep track | |
146 the bugs and their resolution. | |
147 Bug reports can be submitted via email or the web: | |
148 | |
149 bioperl-bugs@bio.perl.org | |
150 http://bugzilla.bioperl.org/ | |
151 | |
152 =head1 AUTHOR - Elia Stupka | |
153 | |
154 Email elia@tll.org.sg | |
155 | |
156 =head1 CONTRIBUTORS | |
157 | |
158 Ewan Birney birney@ebi.ac.uk | |
159 Jason Stajich jason@bioperl.org | |
160 Chris Mungall cjm@fruitfly.bdgp.berkeley.edu | |
161 Lincoln Stein lstein@cshl.org | |
162 Heikki Lehvaslaiho, heikki@ebi.ac.uk | |
163 Hilmar Lapp, hlapp@gmx.net | |
164 Donald G. Jackson, donald.jackson@bms.com | |
165 | |
166 =head1 APPENDIX | |
167 | |
168 The rest of the documentation details each of the object | |
169 methods. Internal methods are usually preceded with a _ | |
170 | |
171 =cut | |
172 | |
173 # Let the code begin... | |
174 | |
175 package Bio::SeqIO::genbank; | |
176 use vars qw(@ISA); | |
177 use strict; | |
178 | |
179 use Bio::SeqIO; | |
180 use Bio::SeqIO::FTHelper; | |
181 use Bio::SeqFeature::Generic; | |
182 use Bio::Species; | |
183 use Bio::Seq::SeqFactory; | |
184 use Bio::Annotation::Collection; | |
185 use Bio::Annotation::Comment; | |
186 use Bio::Annotation::Reference; | |
187 use Bio::Annotation::DBLink; | |
188 | |
189 @ISA = qw(Bio::SeqIO); | |
190 | |
191 sub _initialize { | |
192 my($self,@args) = @_; | |
193 | |
194 $self->SUPER::_initialize(@args); | |
195 # hash for functions for decoding keys. | |
196 $self->{'_func_ftunit_hash'} = {}; | |
197 $self->_show_dna(1); # sets this to one by default. People can change it | |
198 if( ! defined $self->sequence_factory ) { | |
199 $self->sequence_factory(new Bio::Seq::SeqFactory | |
200 (-verbose => $self->verbose(), | |
201 -type => 'Bio::Seq::RichSeq')); | |
202 } | |
203 } | |
204 | |
205 =head2 next_seq | |
206 | |
207 Title : next_seq | |
208 Usage : $seq = $stream->next_seq() | |
209 Function: returns the next sequence in the stream | |
210 Returns : Bio::Seq object | |
211 Args : | |
212 | |
213 =cut | |
214 | |
215 sub next_seq { | |
216 my ($self,@args) = @_; | |
217 my $builder = $self->sequence_builder(); | |
218 my $seq; | |
219 my %params; | |
220 | |
221 RECORDSTART: while (1) { | |
222 my $buffer; | |
223 my (@acc, @features); | |
224 my ($display_id, $annotation); | |
225 my $species; | |
226 | |
227 # initialize; we may come here because of starting over | |
228 @features = (); | |
229 $annotation = undef; | |
230 @acc = (); | |
231 $species = undef; | |
232 %params = (-verbose => $self->verbose); # reset hash | |
233 local($/) = "\n"; | |
234 while(defined($buffer = $self->_readline())) { | |
235 last if index($buffer,'LOCUS ') == 0; | |
236 } | |
237 return undef if( !defined $buffer ); # end of file | |
238 $buffer =~ /^LOCUS\s+(\S.*)$/ || | |
239 $self->throw("GenBank stream with bad LOCUS line. Not GenBank in my book. Got '$buffer'"); | |
240 | |
241 my @tokens = split(' ', $1); | |
242 | |
243 # this is important to have the id for display in e.g. FTHelper, | |
244 # otherwise you won't know which entry caused an error | |
245 $display_id = shift(@tokens); | |
246 $params{'-display_id'} = $display_id; | |
247 # may still be useful if we don't want the seq | |
248 $params{'-length'} = shift(@tokens); | |
249 # the alphabet of the entry | |
250 $params{'-alphabet'} = (lc(shift @tokens) eq 'bp') ? 'dna' : 'protein'; | |
251 # for aa there is usually no 'molecule' (mRNA etc) | |
252 if (($params{'-alphabet'} eq 'dna') || (@tokens > 2)) { | |
253 $params{'-molecule'} = shift(@tokens); | |
254 my $circ = shift(@tokens); | |
255 if ($circ eq 'circular') { | |
256 $params{'-is_circular'} = 1; | |
257 $params{'-division'} = shift(@tokens); | |
258 } else { | |
259 # 'linear' or 'circular' may actually be omitted altogether | |
260 $params{'-division'} = | |
261 (CORE::length($circ) == 3 ) ? $circ : shift(@tokens); | |
262 } | |
263 } else { | |
264 $params{'-molecule'} = 'PRT' if($params{'-alphabet'} eq 'aa'); | |
265 $params{'-division'} = shift(@tokens); | |
266 } | |
267 my $date = join(' ', @tokens); # we lump together the rest | |
268 # this is per request bug #1513 | |
269 # we can handle | |
270 # 9-10-2003 | |
271 # 9-10-03 | |
272 #09-10-2003 | |
273 #09-10-03 | |
274 if($date =~ s/\s*((\d{1,2})-(\w{3})-(\d{2,4})).*/$1/) { | |
275 if( length($date) < 11 ) { # improperly formatted date | |
276 # But we'll be nice and fix it for them | |
277 my ($d,$m,$y) = ($2,$3,$4); | |
278 if( length($d) == 1 ) { | |
279 $d = "0$d"; | |
280 } | |
281 # guess the century here | |
282 if( length($y) == 2 ) { | |
283 if( $y > 60 ) { # arbitrarily guess that '60' means 1960 | |
284 $y = "19$y"; | |
285 } else { | |
286 $y = "20$y"; | |
287 } | |
288 $self->warn("Date was malformed, guessing the century for $date to be $y\n"); | |
289 } | |
290 $params{'-dates'} = [join('-',$d,$m,$y)]; | |
291 } else { | |
292 $params{'-dates'} = [$date]; | |
293 } | |
294 } | |
295 | |
296 # set them all at once | |
297 $builder->add_slot_value(%params); | |
298 %params = (); | |
299 | |
300 # parse the rest if desired, otherwise start over | |
301 if(! $builder->want_object()) { | |
302 $builder->make_object(); | |
303 next RECORDSTART; | |
304 } | |
305 | |
306 # set up annotation depending on what the builder wants | |
307 if($builder->want_slot('annotation')) { | |
308 $annotation = new Bio::Annotation::Collection; | |
309 } | |
310 $buffer = $self->_readline(); | |
311 until( !defined ($buffer) ) { | |
312 $_ = $buffer; | |
313 | |
314 # Description line(s) | |
315 if (/^DEFINITION\s+(\S.*\S)/) { | |
316 my @desc = ($1); | |
317 while ( defined($_ = $self->_readline) ) { | |
318 if( /^\s+(.*)/ ) { push (@desc, $1); next }; | |
319 last; | |
320 } | |
321 $builder->add_slot_value(-desc => join(' ', @desc)); | |
322 # we'll continue right here because DEFINITION always comes | |
323 # at the top of the entry | |
324 } | |
325 # accession number (there can be multiple accessions) | |
326 if( /^ACCESSION\s+(\S.*\S)/ ) { | |
327 push(@acc, split(/\s+/,$1)); | |
328 while( defined($_ = $self->_readline) ) { | |
329 /^\s+(.*)/ && do { push (@acc, split(/\s+/,$1)); next }; | |
330 last; | |
331 } | |
332 $buffer = $_; | |
333 next; | |
334 } | |
335 # PID | |
336 elsif( /^PID\s+(\S+)/ ) { | |
337 $params{'-pid'} = $1; | |
338 } | |
339 #Version number | |
340 elsif( /^VERSION\s+(.+)$/ ) { | |
341 my ($acc,$gi) = split(' ',$1); | |
342 if($acc =~ /^\w+\.(\d+)/) { | |
343 $params{'-version'} = $1; | |
344 $params{'-seq_version'} = $1; | |
345 } | |
346 if($gi && (index($gi,"GI:") == 0)) { | |
347 $params{'-primary_id'} = substr($gi,3); | |
348 } | |
349 } | |
350 #Keywords | |
351 elsif( /^KEYWORDS\s+(.*)/ ) { | |
352 my @kw = split(/\s*\;\s*/,$1); | |
353 while( defined($_ = $self->_readline) ) { | |
354 chomp; | |
355 /^\s+(.*)/ && do { push (@kw, split(/\s*\;\s*/,$1)); next }; | |
356 last; | |
357 } | |
358 | |
359 @kw && $kw[-1] =~ s/\.$//; | |
360 $params{'-keywords'} = \@kw; | |
361 $buffer = $_; | |
362 next; | |
363 } | |
364 # Organism name and phylogenetic information | |
365 elsif (/^SOURCE/) { | |
366 if($builder->want_slot('species')) { | |
367 $species = $self->_read_GenBank_Species(\$buffer); | |
368 $builder->add_slot_value(-species => $species); | |
369 } else { | |
370 while(defined($buffer = $self->_readline())) { | |
371 last if substr($buffer,0,1) ne ' '; | |
372 } | |
373 } | |
374 next; | |
375 } | |
376 #References | |
377 elsif (/^REFERENCE/) { | |
378 if($annotation) { | |
379 my @refs = $self->_read_GenBank_References(\$buffer); | |
380 foreach my $ref ( @refs ) { | |
381 $annotation->add_Annotation('reference',$ref); | |
382 } | |
383 } else { | |
384 while(defined($buffer = $self->_readline())) { | |
385 last if substr($buffer,0,1) ne ' '; | |
386 } | |
387 } | |
388 next; | |
389 } | |
390 #Comments | |
391 elsif (/^COMMENT\s+(.*)/) { | |
392 if($annotation) { | |
393 my $comment = $1; | |
394 while (defined($_ = $self->_readline)) { | |
395 last if (/^\S/); | |
396 $comment .= $_; | |
397 } | |
398 $comment =~ s/\n/ /g; | |
399 $comment =~ s/ +/ /g; | |
400 $annotation->add_Annotation( | |
401 'comment', | |
402 Bio::Annotation::Comment->new(-text => $comment)); | |
403 $buffer = $_; | |
404 } else { | |
405 while(defined($buffer = $self->_readline())) { | |
406 last if substr($buffer,0,1) ne ' '; | |
407 } | |
408 } | |
409 next; | |
410 } elsif( /^SEGMENT\s+(.+)/ ) { | |
411 if($annotation) { | |
412 my $segment = $1; | |
413 while (defined($_ = $self->_readline)) { | |
414 last if (/^\S/); | |
415 $segment .= $_; | |
416 } | |
417 $segment =~ s/\n/ /g; | |
418 $segment =~ s/ +/ /g; | |
419 $annotation->add_Annotation( | |
420 'segment', | |
421 Bio::Annotation::SimpleValue->new(-value => $segment)); | |
422 $buffer = $_; | |
423 } else { | |
424 while(defined($buffer = $self->_readline())) { | |
425 last if substr($buffer,0,1) ne ' '; | |
426 } | |
427 } | |
428 next; | |
429 } | |
430 | |
431 # Exit at start of Feature table, or start of sequence | |
432 last if( /^(FEATURES|ORIGIN)/ ); | |
433 # Get next line and loop again | |
434 $buffer = $self->_readline; | |
435 } | |
436 return undef if(! defined($buffer)); | |
437 | |
438 # add them all at once for efficiency | |
439 $builder->add_slot_value(-accession_number => shift(@acc), | |
440 -secondary_accessions => \@acc, | |
441 %params); | |
442 $builder->add_slot_value(-annotation => $annotation) if $annotation; | |
443 %params = (); # reset before possible re-use to avoid setting twice | |
444 | |
445 # start over if we don't want to continue with this entry | |
446 if(! $builder->want_object()) { | |
447 $builder->make_object(); | |
448 next RECORDSTART; | |
449 } | |
450 | |
451 # some "minimal" formats may not necessarily have a feature table | |
452 if($builder->want_slot('features') && defined($_) && /^FEATURES/) { | |
453 # need to read the first line of the feature table | |
454 $buffer = $self->_readline; | |
455 | |
456 # DO NOT read lines in the while condition -- this is done as a side | |
457 # effect in _read_FTHelper_GenBank! | |
458 while( defined($buffer) ) { | |
459 # check immediately -- not at the end of the loop | |
460 # note: GenPept entries obviously do not have a BASE line | |
461 last if(($buffer =~ /^BASE/) || ($buffer =~ /^ORIGIN/) || | |
462 ($buffer =~ /^CONTIG/) ); | |
463 | |
464 # slurp in one feature at a time -- at return, the start of | |
465 # the next feature will have been read already, so we need | |
466 # to pass a reference, and the called method must set this | |
467 # to the last line read before returning | |
468 | |
469 my $ftunit = $self->_read_FTHelper_GenBank(\$buffer); | |
470 | |
471 # fix suggested by James Diggans | |
472 | |
473 if( !defined $ftunit ) { | |
474 # GRRRR. We have fallen over. Try to recover | |
475 $self->warn("Unexpected error in feature table for ".$params{'-display_id'}." Skipping feature, attempting to recover"); | |
476 unless( ($buffer =~ /^\s{5,5}\S+/) or ($buffer =~ /^\S+/)) { | |
477 $buffer = $self->_readline(); | |
478 } | |
479 next; # back to reading FTHelpers | |
480 } | |
481 | |
482 # process ftunit | |
483 my $feat = | |
484 $ftunit->_generic_seqfeature($self->location_factory(), | |
485 $display_id); | |
486 # add taxon_id from source if available | |
487 if($species && ($feat->primary_tag eq 'source') && | |
488 $feat->has_tag('db_xref') && (! $species->ncbi_taxid())) { | |
489 foreach my $tagval ($feat->get_tag_values('db_xref')) { | |
490 if(index($tagval,"taxon:") == 0) { | |
491 $species->ncbi_taxid(substr($tagval,6)); | |
492 } | |
493 } | |
494 } | |
495 # add feature to list of features | |
496 push(@features, $feat); | |
497 } | |
498 $builder->add_slot_value(-features => \@features); | |
499 $_ = $buffer; | |
500 } | |
501 if( defined ($_) ) { | |
502 if( /^CONTIG/ && $builder->want_slot('features')) { | |
503 $b = " $_"; # need 5 spaces to treat it like a feature | |
504 my $ftunit = $self->_read_FTHelper_GenBank(\$b); | |
505 if( ! defined $ftunit ) { | |
506 $self->warn("unable to parse the CONTIG feature\n"); | |
507 } else { | |
508 push(@features, | |
509 $ftunit->_generic_seqfeature($self->location_factory(), | |
510 $display_id)); | |
511 } | |
512 } elsif(! /^(ORIGIN|\/\/)/ ) { # advance to the sequence, if any | |
513 while (defined( $_ = $self->_readline) ) { | |
514 last if /^(ORIGIN|\/\/)/; | |
515 } | |
516 } | |
517 } | |
518 if(! $builder->want_object()) { | |
519 $builder->make_object(); # implicit end-of-object | |
520 next RECORDSTART; | |
521 } | |
522 if($builder->want_slot('seq')) { | |
523 # the fact that we want a sequence does not necessarily mean that | |
524 # there also is a sequence ... | |
525 if(defined($_) && s/^ORIGIN//) { | |
526 chomp; | |
527 if( $annotation && length($_) > 0 ) { | |
528 $annotation->add_Annotation('origin', | |
529 Bio::Annotation::SimpleValue->new(-value => $_)); | |
530 } | |
531 my $seqc = ''; | |
532 while( defined($_ = $self->_readline) ) { | |
533 /^\/\// && last; | |
534 $_ = uc($_); | |
535 s/[^A-Za-z]//g; | |
536 $seqc .= $_; | |
537 } | |
538 $self->debug("sequence length is ". length($seqc) ."\n"); | |
539 $builder->add_slot_value(-seq => $seqc); | |
540 } | |
541 } elsif ( defined($_) && (substr($_,0,2) ne '//')) { | |
542 # advance to the end of the record | |
543 while( defined($_ = $self->_readline) ) { | |
544 last if substr($_,0,2) eq '//'; | |
545 } | |
546 } | |
547 # Unlikely, but maybe the sequence is so weird that we don't want it | |
548 # anymore. We don't want to return undef if the stream's not exhausted | |
549 # yet. | |
550 $seq = $builder->make_object(); | |
551 next RECORDSTART unless $seq; | |
552 last RECORDSTART; | |
553 } # end while RECORDSTART | |
554 | |
555 return $seq; | |
556 } | |
557 | |
558 =head2 write_seq | |
559 | |
560 Title : write_seq | |
561 Usage : $stream->write_seq($seq) | |
562 Function: writes the $seq object (must be seq) to the stream | |
563 Returns : 1 for success and 0 for error | |
564 Args : array of 1 to n Bio::SeqI objects | |
565 | |
566 | |
567 =cut | |
568 | |
569 sub write_seq { | |
570 my ($self,@seqs) = @_; | |
571 | |
572 foreach my $seq ( @seqs ) { | |
573 $self->throw("Attempting to write with no seq!") unless defined $seq; | |
574 | |
575 if( ! ref $seq || ! $seq->isa('Bio::SeqI') ) { | |
576 $self->warn(" $seq is not a SeqI compliant module. Attempting to dump, but may fail!"); | |
577 } | |
578 | |
579 my $str = $seq->seq; | |
580 | |
581 my ($div, $mol); | |
582 my $len = $seq->length(); | |
583 | |
584 if ( $seq->can('division') ) { | |
585 $div=$seq->division; | |
586 } | |
587 if( !defined $div || ! $div ) { $div = 'UNK'; } | |
588 my $alpha = $seq->alphabet; | |
589 if( !$seq->can('molecule') || ! defined ($mol = $seq->molecule()) ) { | |
590 $mol = $alpha || 'DNA'; | |
591 } | |
592 | |
593 my $circular = 'linear '; | |
594 $circular = 'circular' if $seq->is_circular; | |
595 | |
596 local($^W) = 0; # supressing warnings about uninitialized fields. | |
597 | |
598 my $temp_line; | |
599 if( $self->_id_generation_func ) { | |
600 $temp_line = &{$self->_id_generation_func}($seq); | |
601 } else { | |
602 my $date = ''; | |
603 if( $seq->can('get_dates') ) { | |
604 ($date) = $seq->get_dates(); | |
605 } | |
606 $temp_line = sprintf ("%-12s%-15s%13s %s%4s%-8s%-8s %3s %-s", | |
607 'LOCUS', $seq->id(),$len, | |
608 (lc($alpha) eq 'protein') ? ('aa','', '') : | |
609 ('bp', '',$mol),$circular, | |
610 $div,$date); | |
611 } | |
612 | |
613 $self->_print("$temp_line\n"); | |
614 $self->_write_line_GenBank_regex("DEFINITION ", " ", | |
615 $seq->desc(),"\\s\+\|\$",80); | |
616 | |
617 # if there, write the accession line | |
618 | |
619 if( $self->_ac_generation_func ) { | |
620 $temp_line = &{$self->_ac_generation_func}($seq); | |
621 $self->_print("ACCESSION $temp_line\n"); | |
622 } else { | |
623 my @acc = (); | |
624 push(@acc, $seq->accession_number()); | |
625 if( $seq->isa('Bio::Seq::RichSeqI') ) { | |
626 push(@acc, $seq->get_secondary_accessions()); | |
627 } | |
628 $self->_print("ACCESSION ", join(" ", @acc), "\n"); | |
629 # otherwise - cannot print <sigh> | |
630 } | |
631 | |
632 # if PID defined, print it | |
633 if($seq->isa('Bio::Seq::RichSeqI') && $seq->pid()) { | |
634 $self->_print("PID ", $seq->pid(), "\n"); | |
635 } | |
636 | |
637 # if there, write the version line | |
638 | |
639 if( defined $self->_sv_generation_func() ) { | |
640 $temp_line = &{$self->_sv_generation_func}($seq); | |
641 if( $temp_line ) { | |
642 $self->_print("VERSION $temp_line\n"); | |
643 } | |
644 } else { | |
645 if($seq->isa('Bio::Seq::RichSeqI') && defined($seq->seq_version)) { | |
646 my $id = $seq->primary_id(); # this may be a GI number | |
647 $self->_print("VERSION ", | |
648 $seq->accession_number(), ".", $seq->seq_version, | |
649 ($id && ($id =~ /^\d+$/) ? " GI:".$id : ""), | |
650 "\n"); | |
651 } | |
652 } | |
653 | |
654 # if there, write the keywords line | |
655 | |
656 if( defined $self->_kw_generation_func() ) { | |
657 $temp_line = &{$self->_kw_generation_func}($seq); | |
658 $self->_print("KEYWORDS $temp_line\n"); | |
659 } else { | |
660 if( $seq->can('keywords') ) { | |
661 my $kw = $seq->keywords; | |
662 if( ref($kw) =~ /ARRAY/i ) { | |
663 $kw = join("; ", @$kw); | |
664 } | |
665 $kw .= '.' if( $kw !~ /\.$/ ); | |
666 $self->_print("KEYWORDS $kw\n"); | |
667 } | |
668 } | |
669 | |
670 # SEGMENT if it exists | |
671 foreach my $ref ( $seq->annotation->get_Annotations('segment') ) { | |
672 $self->_print(sprintf ("%-11s %s\n",'SEGMENT', | |
673 $ref->value)); | |
674 } | |
675 | |
676 # Organism lines | |
677 if (my $spec = $seq->species) { | |
678 my ($species, $genus, @class) = $spec->classification(); | |
679 my $OS; | |
680 if( $spec->common_name ) { | |
681 $OS = $spec->common_name; | |
682 } else { | |
683 $OS = "$genus $species"; | |
684 } | |
685 if (my $ssp = $spec->sub_species) { | |
686 $OS .= " $ssp"; | |
687 } | |
688 $self->_print("SOURCE $OS\n"); | |
689 $self->_print(" ORGANISM ", | |
690 ($spec->organelle() ? $spec->organelle()." " : ""), | |
691 "$genus $species", "\n"); | |
692 my $OC = join('; ', (reverse(@class), $genus)) .'.'; | |
693 $self->_write_line_GenBank_regex(' 'x12,' 'x12, | |
694 $OC,"\\s\+\|\$",80); | |
695 } | |
696 | |
697 # Reference lines | |
698 my $count = 1; | |
699 foreach my $ref ( $seq->annotation->get_Annotations('reference') ) { | |
700 $temp_line = sprintf ("REFERENCE $count (%s %d to %d)", | |
701 ($seq->alphabet() eq "protein" ? | |
702 "residues" : "bases"), | |
703 $ref->start,$ref->end); | |
704 $self->_print("$temp_line\n"); | |
705 $self->_write_line_GenBank_regex(" AUTHORS ",' 'x12, | |
706 $ref->authors,"\\s\+\|\$",80); | |
707 $self->_write_line_GenBank_regex(" TITLE "," "x12, | |
708 $ref->title,"\\s\+\|\$",80); | |
709 $self->_write_line_GenBank_regex(" JOURNAL "," "x12, | |
710 $ref->location,"\\s\+\|\$",80); | |
711 if ($ref->comment) { | |
712 $self->_write_line_GenBank_regex(" REMARK "," "x12, | |
713 $ref->comment,"\\s\+\|\$",80); | |
714 } | |
715 if( $ref->medline) { | |
716 $self->_write_line_GenBank_regex(" MEDLINE "," "x12, | |
717 $ref->medline, "\\s\+\|\$",80); | |
718 # I am assuming that pubmed entries only exist when there | |
719 # are also MEDLINE entries due to the indentation | |
720 # This could be a wrong assumption | |
721 if( $ref->pubmed ) { | |
722 $self->_write_line_GenBank_regex(" PUBMED "," "x12, | |
723 $ref->pubmed, "\\s\+\|\$", | |
724 80); | |
725 } | |
726 } | |
727 $count++; | |
728 } | |
729 # Comment lines | |
730 | |
731 foreach my $comment ( $seq->annotation->get_Annotations('comment') ) { | |
732 $self->_write_line_GenBank_regex("COMMENT "," "x12, | |
733 $comment->text,"\\s\+\|\$",80); | |
734 } | |
735 $self->_print("FEATURES Location/Qualifiers\n"); | |
736 | |
737 my $contig; | |
738 if( defined $self->_post_sort ) { | |
739 # we need to read things into an array. Process. Sort them. Print 'em | |
740 | |
741 my $post_sort_func = $self->_post_sort(); | |
742 my @fth; | |
743 | |
744 foreach my $sf ( $seq->top_SeqFeatures ) { | |
745 push(@fth,Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq)); | |
746 } | |
747 | |
748 @fth = sort { &$post_sort_func($a,$b) } @fth; | |
749 | |
750 foreach my $fth ( @fth ) { | |
751 $self->_print_GenBank_FTHelper($fth); | |
752 } | |
753 } else { | |
754 # not post sorted. And so we can print as we get them. | |
755 # lower memory load... | |
756 | |
757 foreach my $sf ( $seq->top_SeqFeatures ) { | |
758 my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq); | |
759 foreach my $fth ( @fth ) { | |
760 if( ! $fth->isa('Bio::SeqIO::FTHelper') ) { | |
761 $sf->throw("Cannot process FTHelper... $fth"); | |
762 } | |
763 $self->_print_GenBank_FTHelper($fth); | |
764 } | |
765 } | |
766 } | |
767 if( $seq->length == 0 ) { $self->_show_dna(0) } | |
768 | |
769 if( $self->_show_dna() == 0 ) { | |
770 $self->_print("\n//\n"); | |
771 return; | |
772 } | |
773 | |
774 # finished printing features. | |
775 | |
776 $str =~ tr/A-Z/a-z/; | |
777 | |
778 # Count each nucleotide | |
779 unless( $mol eq 'protein' ) { | |
780 my $alen = $str =~ tr/a/a/; | |
781 my $clen = $str =~ tr/c/c/; | |
782 my $glen = $str =~ tr/g/g/; | |
783 my $tlen = $str =~ tr/t/t/; | |
784 | |
785 my $olen = $len - ($alen + $tlen + $clen + $glen); | |
786 if( $olen < 0 ) { | |
787 $self->warn("Weird. More atgc than bases. Problem!"); | |
788 } | |
789 | |
790 my $base_count = sprintf("BASE COUNT %8s a %6s c %6s g %6s t%s\n", | |
791 $alen,$clen,$glen,$tlen, | |
792 ( $olen > 0 ) ? sprintf("%6s others",$olen) : ''); | |
793 $self->_print($base_count); | |
794 } | |
795 my ($o) = $seq->annotation->get_Annotations('origin'); | |
796 $self->_print(sprintf("%-6s%s\n",'ORIGIN',$o ? $o->value : '')); | |
797 | |
798 # print out the sequence | |
799 my $nuc = 60; # Number of nucleotides per line | |
800 my $whole_pat = 'a10' x 6; # Pattern for unpacking a whole line | |
801 my $out_pat = 'A11' x 6; # Pattern for packing a line | |
802 my $length = length($str); | |
803 | |
804 # Calculate the number of nucleotides which fit on whole lines | |
805 my $whole = int($length / $nuc) * $nuc; | |
806 | |
807 # Print the whole lines | |
808 my $i; | |
809 for ($i = 0; $i < $whole; $i += $nuc) { | |
810 my $blocks = pack $out_pat, | |
811 unpack $whole_pat, | |
812 substr($str, $i, $nuc); | |
813 chop $blocks; | |
814 $self->_print(sprintf("%9d $blocks\n", $i + $nuc - 59)); | |
815 } | |
816 | |
817 # Print the last line | |
818 if (my $last = substr($str, $i)) { | |
819 my $last_len = length($last); | |
820 my $last_pat = 'a10' x int($last_len / 10) .'a'. $last_len % 10; | |
821 my $blocks = pack $out_pat, | |
822 unpack($last_pat, $last); | |
823 $blocks =~ s/ +$//; | |
824 $self->_print(sprintf("%9d $blocks\n", $length - $last_len + 1)); | |
825 } | |
826 | |
827 $self->_print("//\n"); | |
828 | |
829 $self->flush if $self->_flush_on_write && defined $self->_fh; | |
830 return 1; | |
831 } | |
832 } | |
833 | |
834 =head2 _print_GenBank_FTHelper | |
835 | |
836 Title : _print_GenBank_FTHelper | |
837 Usage : | |
838 Function: | |
839 Example : | |
840 Returns : | |
841 Args : | |
842 | |
843 | |
844 =cut | |
845 | |
846 sub _print_GenBank_FTHelper { | |
847 my ($self,$fth,$always_quote) = @_; | |
848 | |
849 if( ! ref $fth || ! $fth->isa('Bio::SeqIO::FTHelper') ) { | |
850 $fth->warn("$fth is not a FTHelper class. Attempting to print, but there could be tears!"); | |
851 } | |
852 if( defined $fth->key && | |
853 $fth->key eq 'CONTIG' ) { | |
854 $self->_write_line_GenBank_regex(sprintf("%-12s",$fth->key), | |
855 ' 'x12,$fth->loc,"\,\|\$",80); | |
856 } else { | |
857 $self->_write_line_GenBank_regex(sprintf(" %-16s",$fth->key), | |
858 " "x21, | |
859 $fth->loc,"\,\|\$",80); | |
860 } | |
861 | |
862 if( !defined $always_quote) { $always_quote = 0; } | |
863 | |
864 foreach my $tag ( keys %{$fth->field} ) { | |
865 foreach my $value ( @{$fth->field->{$tag}} ) { | |
866 $value =~ s/\"/\"\"/g; | |
867 if ($value eq "_no_value") { | |
868 $self->_write_line_GenBank_regex(" "x21, | |
869 " "x21, | |
870 "/$tag","\.\|\$",80); | |
871 } | |
872 elsif( $always_quote == 1 || $value !~ /^\d+$/ ) { | |
873 my ($pat) = ($value =~ /\s/ ? '\s|$' : '.|$'); | |
874 $self->_write_line_GenBank_regex(" "x21, | |
875 " "x21, | |
876 "/$tag=\"$value\"",$pat,80); | |
877 } else { | |
878 $self->_write_line_GenBank_regex(" "x21, | |
879 " "x21, | |
880 "/$tag=$value","\.\|\$",80); | |
881 } | |
882 } | |
883 } | |
884 | |
885 } | |
886 | |
887 | |
888 =head2 _read_GenBank_References | |
889 | |
890 Title : _read_GenBank_References | |
891 Usage : | |
892 Function: Reads references from GenBank format. Internal function really | |
893 Returns : | |
894 Args : | |
895 | |
896 | |
897 =cut | |
898 | |
899 sub _read_GenBank_References{ | |
900 my ($self,$buffer) = @_; | |
901 my (@refs); | |
902 my $ref; | |
903 | |
904 # assumme things are starting with RN | |
905 | |
906 if( $$buffer !~ /^REFERENCE/ ) { | |
907 warn("Not parsing line '$$buffer' which maybe important"); | |
908 } | |
909 | |
910 $_ = $$buffer; | |
911 | |
912 my (@title,@loc,@authors,@com,@medline,@pubmed); | |
913 | |
914 REFLOOP: while( defined($_) || defined($_ = $self->_readline) ) { | |
915 if (/^ AUTHORS\s+(.*)/) { | |
916 push (@authors, $1); | |
917 while ( defined($_ = $self->_readline) ) { | |
918 /^\s{3,}(.*)/ && do { push (@authors, $1);next;}; | |
919 last; | |
920 } | |
921 $ref->authors(join(' ', @authors)); | |
922 } | |
923 if (/^ TITLE\s+(.*)/) { | |
924 push (@title, $1); | |
925 while ( defined($_ = $self->_readline) ) { | |
926 /^\s{3,}(.*)/ && do { push (@title, $1); | |
927 next; | |
928 }; | |
929 last; | |
930 } | |
931 $ref->title(join(' ', @title)); | |
932 } | |
933 if (/^ JOURNAL\s+(.*)/) { | |
934 push(@loc, $1); | |
935 while ( defined($_ = $self->_readline) ) { | |
936 /^\s{3,}(.*)/ && do { push(@loc, $1); | |
937 next; | |
938 }; | |
939 last; | |
940 } | |
941 $ref->location(join(' ', @loc)); | |
942 redo REFLOOP; | |
943 } | |
944 if (/^ REMARK\s+(.*)/) { | |
945 push (@com, $1); | |
946 while ( defined($_ = $self->_readline) ) { | |
947 /^\s{3,}(.*)/ && do { push(@com, $1); | |
948 next; | |
949 }; | |
950 last; | |
951 } | |
952 $ref->comment(join(' ', @com)); | |
953 redo REFLOOP; | |
954 } | |
955 if( /^ MEDLINE\s+(.*)/ ) { | |
956 push(@medline,$1); | |
957 while ( defined($_ = $self->_readline) ) { | |
958 /^\s{4,}(.*)/ && do { push(@medline, $1); | |
959 next; | |
960 }; | |
961 last; | |
962 } | |
963 $ref->medline(join(' ', @medline)); | |
964 redo REFLOOP; | |
965 } | |
966 if( /^ PUBMED\s+(.*)/ ) { | |
967 push(@pubmed,$1); | |
968 while ( defined($_ = $self->_readline) ) { | |
969 /^\s{5,}(.*)/ && do { push(@pubmed, $1); | |
970 next; | |
971 }; | |
972 last; | |
973 } | |
974 $ref->pubmed(join(' ', @pubmed)); | |
975 redo REFLOOP; | |
976 } | |
977 | |
978 /^REFERENCE/ && do { | |
979 # store current reference | |
980 $self->_add_ref_to_array(\@refs,$ref) if $ref; | |
981 # reset | |
982 @authors = (); | |
983 @title = (); | |
984 @loc = (); | |
985 @com = (); | |
986 @pubmed = (); | |
987 @medline = (); | |
988 # create the new reference object | |
989 $ref = Bio::Annotation::Reference->new(); | |
990 # check whether start and end base is given | |
991 if (/^REFERENCE\s+\d+\s+\([a-z]+ (\d+) to (\d+)/){ | |
992 $ref->start($1); | |
993 $ref->end($2); | |
994 } | |
995 }; | |
996 | |
997 /^(FEATURES)|(COMMENT)/ && last; | |
998 | |
999 $_ = undef; # Empty $_ to trigger read of next line | |
1000 } | |
1001 | |
1002 # store last reference | |
1003 $self->_add_ref_to_array(\@refs,$ref) if $ref; | |
1004 | |
1005 $$buffer = $_; | |
1006 | |
1007 #print "\nnumber of references found: ", $#refs+1,"\n"; | |
1008 | |
1009 return @refs; | |
1010 } | |
1011 | |
1012 # | |
1013 # This is undocumented as it shouldn't be called by anywhere else as | |
1014 # read_GenBank_References. For those who still want to know: | |
1015 # | |
1016 # Purpose: adds a Reference object to an array of Reference objects, takes | |
1017 # care of possible cleanups to be done (currently, only author and title | |
1018 # will be chopped of trailing semicolons). | |
1019 # Parameters: | |
1020 # a reference to an array of Reference objects | |
1021 # the Reference object to be added | |
1022 # Returns: nothing | |
1023 # | |
1024 sub _add_ref_to_array { | |
1025 my ($self, $refs, $ref) = @_; | |
1026 | |
1027 # first, polish author and title by removing possible trailing semicolons | |
1028 my $au = $ref->authors(); | |
1029 my $title = $ref->title(); | |
1030 $au =~ s/;\s*$//g if $au; | |
1031 $title =~ s/;\s*$//g if $title; | |
1032 $ref->authors($au); | |
1033 $ref->title($title); | |
1034 # the rest should be clean already, so go ahead and add it | |
1035 push(@{$refs}, $ref); | |
1036 } | |
1037 | |
1038 =head2 _read_GenBank_Species | |
1039 | |
1040 Title : _read_GenBank_Species | |
1041 Usage : | |
1042 Function: Reads the GenBank Organism species and classification | |
1043 lines. | |
1044 Example : | |
1045 Returns : A Bio::Species object | |
1046 Args : a reference to the current line buffer | |
1047 | |
1048 =cut | |
1049 | |
1050 sub _read_GenBank_Species { | |
1051 my( $self,$buffer) = @_; | |
1052 my @organell_names = ("chloroplast", "mitochondr"); | |
1053 # only those carrying DNA, apart from the nucleus | |
1054 | |
1055 $_ = $$buffer; | |
1056 | |
1057 my( $sub_species, $species, $genus, $common, $organelle, @class ); | |
1058 # upon first entering the loop, we must not read a new line -- the SOURCE | |
1059 # line is already in the buffer (HL 05/10/2000) | |
1060 while (defined($_) || defined($_ = $self->_readline())) { | |
1061 # de-HTMLify (links that may be encountered here don't contain | |
1062 # escaped '>', so a simple-minded approach suffices) | |
1063 s/<[^>]+>//g; | |
1064 if (/^SOURCE\s+(.*)/) { | |
1065 # FIXME this is probably mostly wrong (e.g., it yields things like | |
1066 # Homo sapiens adult placenta cDNA to mRNA | |
1067 # which is certainly not what you want) | |
1068 $common = $1; | |
1069 $common =~ s/\.$//; # remove trailing dot | |
1070 } elsif (/^\s+ORGANISM/) { | |
1071 my @spflds = split(' ', $_); | |
1072 shift(@spflds); # ORGANISM | |
1073 if(grep { $_ =~ /^$spflds[0]/i; } @organell_names) { | |
1074 $organelle = shift(@spflds); | |
1075 } | |
1076 $genus = shift(@spflds); | |
1077 if(@spflds) { | |
1078 $species = shift(@spflds); | |
1079 } else { | |
1080 $species = "sp."; | |
1081 } | |
1082 $sub_species = shift(@spflds) if(@spflds); | |
1083 } elsif (/^\s+(.+)/) { | |
1084 # only split on ';' or '.' so that | |
1085 # classification that is 2 words will | |
1086 # still get matched | |
1087 # use map to remove trailing/leading spaces | |
1088 push(@class, map { s/^\s+//; s/\s+$//; $_; } split /[;\.]+/, $1); | |
1089 } else { | |
1090 last; | |
1091 } | |
1092 | |
1093 $_ = undef; # Empty $_ to trigger read of next line | |
1094 } | |
1095 | |
1096 $$buffer = $_; | |
1097 | |
1098 # Don't make a species object if it's empty or "Unknown" or "None" | |
1099 return unless $genus and $genus !~ /^(Unknown|None)$/i; | |
1100 | |
1101 # Bio::Species array needs array in Species -> Kingdom direction | |
1102 if ($class[$#class] eq $genus) { | |
1103 push( @class, $species ); | |
1104 } else { | |
1105 push( @class, $genus, $species ); | |
1106 } | |
1107 @class = reverse @class; | |
1108 | |
1109 my $make = Bio::Species->new(); | |
1110 $make->classification( \@class, "FORCE" ); # no name validation please | |
1111 $make->common_name( $common ) if $common; | |
1112 $make->sub_species( $sub_species ) if $sub_species; | |
1113 $make->organelle($organelle) if $organelle; | |
1114 return $make; | |
1115 } | |
1116 | |
1117 =head2 _read_FTHelper_GenBank | |
1118 | |
1119 Title : _read_FTHelper_GenBank | |
1120 Usage : _read_FTHelper_GenBank($buffer) | |
1121 Function: reads the next FT key line | |
1122 Example : | |
1123 Returns : Bio::SeqIO::FTHelper object | |
1124 Args : filehandle and reference to a scalar | |
1125 | |
1126 | |
1127 =cut | |
1128 | |
1129 sub _read_FTHelper_GenBank { | |
1130 my ($self,$buffer) = @_; | |
1131 | |
1132 my ($key, # The key of the feature | |
1133 $loc # The location line from the feature | |
1134 ); | |
1135 my @qual = (); # An arrray of lines making up the qualifiers | |
1136 | |
1137 if ($$buffer =~ /^ (\S+)\s+(.+?)\s*$/o) { | |
1138 $key = $1; | |
1139 $loc = $2; | |
1140 # Read all the lines up to the next feature | |
1141 while ( defined($_ = $self->_readline) ) { | |
1142 if (/^(\s+)(.+?)\s*$/o) { | |
1143 # Lines inside features are preceded by 21 spaces | |
1144 # A new feature is preceded by 5 spaces | |
1145 if (length($1) > 6) { | |
1146 # Add to qualifiers if we're in the qualifiers, or if it's | |
1147 # the first qualifier | |
1148 if (@qual || (index($2,'/') == 0)) { | |
1149 push(@qual, $2); | |
1150 } | |
1151 # We're still in the location line, so append to location | |
1152 else { | |
1153 $loc .= $2; | |
1154 } | |
1155 } else { | |
1156 # We've reached the start of the next feature | |
1157 last; | |
1158 } | |
1159 } else { | |
1160 # We're at the end of the feature table | |
1161 last; | |
1162 } | |
1163 } | |
1164 } else { | |
1165 # No feature key | |
1166 $self->debug("no feature key!\n"); | |
1167 # change suggested by JDiggans to avoid infinite loop- | |
1168 # see bugreport 1062. | |
1169 # reset buffer to prevent infinite loop | |
1170 $$buffer = $self->_readline(); | |
1171 return; | |
1172 } | |
1173 | |
1174 # Put the first line of the next feature into the buffer | |
1175 $$buffer = $_; | |
1176 | |
1177 # Make the new FTHelper object | |
1178 my $out = new Bio::SeqIO::FTHelper(); | |
1179 $out->verbose($self->verbose()); | |
1180 $out->key($key); | |
1181 $out->loc($loc); | |
1182 | |
1183 # Now parse and add any qualifiers. (@qual is kept | |
1184 # intact to provide informative error messages.) | |
1185 QUAL: for (my $i = 0; $i < @qual; $i++) { | |
1186 $_ = $qual[$i]; | |
1187 my( $qualifier, $value ) = (m{^/([^=]+)(?:=(.+))?}) | |
1188 or $self->warn("cannot see new qualifier in feature $key: ". | |
1189 $qual[$i]); | |
1190 #or $self->throw("Can't see new qualifier in: $_\nfrom:\n" | |
1191 # . join('', map "$_\n", @qual)); | |
1192 $qualifier = '' unless( defined $qualifier); | |
1193 if (defined $value) { | |
1194 # Do we have a quoted value? | |
1195 if (substr($value, 0, 1) eq '"') { | |
1196 # Keep adding to value until we find the trailing quote | |
1197 # and the quotes are balanced | |
1198 while ($value !~ /\"$/ or $value =~ tr/"/"/ % 2) { | |
1199 if($i >= $#qual) { | |
1200 $self->warn("Unbalanced quote in:\n" . | |
1201 join('', map("$_\n", @qual)) . | |
1202 "No further qualifiers will " . | |
1203 "be added for this feature"); | |
1204 last QUAL; | |
1205 } | |
1206 $i++; # modifying a for-loop variable inside of the loop | |
1207 # is not the best programming style ... | |
1208 my $next = $qual[$i]; | |
1209 | |
1210 # add to value with a space unless the value appears | |
1211 # to be a sequence (translation for example) | |
1212 if(($value.$next) =~ /[^A-Za-z"-]/) { | |
1213 $value .= " "; | |
1214 } | |
1215 $value .= $next; | |
1216 } | |
1217 # Trim leading and trailing quotes | |
1218 $value =~ s/^"|"$//g; | |
1219 # Undouble internal quotes | |
1220 $value =~ s/""/\"/g; | |
1221 } | |
1222 } else { | |
1223 $value = '_no_value'; | |
1224 } | |
1225 # Store the qualifier | |
1226 $out->field->{$qualifier} ||= []; | |
1227 push(@{$out->field->{$qualifier}},$value); | |
1228 } | |
1229 return $out; | |
1230 } | |
1231 | |
1232 =head2 _write_line_GenBank | |
1233 | |
1234 Title : _write_line_GenBank | |
1235 Usage : | |
1236 Function: internal function | |
1237 Example : | |
1238 Returns : | |
1239 Args : | |
1240 | |
1241 | |
1242 =cut | |
1243 | |
1244 sub _write_line_GenBank{ | |
1245 my ($self,$pre1,$pre2,$line,$length) = @_; | |
1246 | |
1247 $length || $self->throw("Miscalled write_line_GenBank without length. Programming error!"); | |
1248 my $subl = $length - length $pre2; | |
1249 my $linel = length $line; | |
1250 my $i; | |
1251 | |
1252 my $sub = substr($line,0,$length - length $pre1); | |
1253 | |
1254 $self->_print("$pre1$sub\n"); | |
1255 | |
1256 for($i= ($length - length $pre1);$i < $linel;) { | |
1257 $sub = substr($line,$i,($subl)); | |
1258 $self->_print("$pre2$sub\n"); | |
1259 $i += $subl; | |
1260 } | |
1261 | |
1262 } | |
1263 | |
1264 =head2 _write_line_GenBank_regex | |
1265 | |
1266 Title : _write_line_GenBank_regex | |
1267 Usage : | |
1268 Function: internal function for writing lines of specified | |
1269 length, with different first and the next line | |
1270 left hand headers and split at specific points in the | |
1271 text | |
1272 Example : | |
1273 Returns : nothing | |
1274 Args : file handle, first header, second header, text-line, regex for line breaks, total line length | |
1275 | |
1276 | |
1277 =cut | |
1278 | |
1279 sub _write_line_GenBank_regex { | |
1280 my ($self,$pre1,$pre2,$line,$regex,$length) = @_; | |
1281 | |
1282 #print STDOUT "Going to print with $line!\n"; | |
1283 | |
1284 $length || $self->throw( "Miscalled write_line_GenBank without length. Programming error!"); | |
1285 | |
1286 # if( length $pre1 != length $pre2 ) { | |
1287 # $self->throw( "Programming error - cannot called write_line_GenBank_regex with different length pre1 and pre2 tags!"); | |
1288 # } | |
1289 | |
1290 my $subl = $length - (length $pre1) - 2; | |
1291 my @lines = (); | |
1292 | |
1293 CHUNK: while($line) { | |
1294 foreach my $pat ($regex, '[,;\.\/-]\s|'.$regex, '[,;\.\/-]|'.$regex) { | |
1295 if($line =~ m/^(.{1,$subl})($pat)(.*)/) { | |
1296 $line = $3; | |
1297 # be strict about not padding spaces according to | |
1298 # genbank format | |
1299 my $l = $1.$2; | |
1300 $l =~ s/\s+$//; | |
1301 push(@lines, $l); | |
1302 next CHUNK; | |
1303 } | |
1304 } | |
1305 # if we get here none of the patterns matched $subl or less chars | |
1306 $self->warn("trouble dissecting \"$line\" into chunks ". | |
1307 "of $subl chars or less - this tag won't print right"); | |
1308 # insert a space char to prevent infinite loops | |
1309 $line = substr($line,0,$subl) . " " . substr($line,$subl); | |
1310 } | |
1311 | |
1312 my $s = shift @lines; | |
1313 $self->_print("$pre1$s\n"); | |
1314 foreach my $s ( @lines ) { | |
1315 $self->_print("$pre2$s\n"); | |
1316 } | |
1317 } | |
1318 | |
1319 =head2 _post_sort | |
1320 | |
1321 Title : _post_sort | |
1322 Usage : $obj->_post_sort($newval) | |
1323 Function: | |
1324 Returns : value of _post_sort | |
1325 Args : newvalue (optional) | |
1326 | |
1327 | |
1328 =cut | |
1329 | |
1330 sub _post_sort{ | |
1331 my ($obj,$value) = @_; | |
1332 if( defined $value) { | |
1333 $obj->{'_post_sort'} = $value; | |
1334 } | |
1335 return $obj->{'_post_sort'}; | |
1336 } | |
1337 | |
1338 =head2 _show_dna | |
1339 | |
1340 Title : _show_dna | |
1341 Usage : $obj->_show_dna($newval) | |
1342 Function: | |
1343 Returns : value of _show_dna | |
1344 Args : newvalue (optional) | |
1345 | |
1346 | |
1347 =cut | |
1348 | |
1349 sub _show_dna{ | |
1350 my ($obj,$value) = @_; | |
1351 if( defined $value) { | |
1352 $obj->{'_show_dna'} = $value; | |
1353 } | |
1354 return $obj->{'_show_dna'}; | |
1355 } | |
1356 | |
1357 =head2 _id_generation_func | |
1358 | |
1359 Title : _id_generation_func | |
1360 Usage : $obj->_id_generation_func($newval) | |
1361 Function: | |
1362 Returns : value of _id_generation_func | |
1363 Args : newvalue (optional) | |
1364 | |
1365 | |
1366 =cut | |
1367 | |
1368 sub _id_generation_func{ | |
1369 my ($obj,$value) = @_; | |
1370 if( defined $value ) { | |
1371 $obj->{'_id_generation_func'} = $value; | |
1372 } | |
1373 return $obj->{'_id_generation_func'}; | |
1374 } | |
1375 | |
1376 =head2 _ac_generation_func | |
1377 | |
1378 Title : _ac_generation_func | |
1379 Usage : $obj->_ac_generation_func($newval) | |
1380 Function: | |
1381 Returns : value of _ac_generation_func | |
1382 Args : newvalue (optional) | |
1383 | |
1384 | |
1385 =cut | |
1386 | |
1387 sub _ac_generation_func{ | |
1388 my ($obj,$value) = @_; | |
1389 if( defined $value ) { | |
1390 $obj->{'_ac_generation_func'} = $value; | |
1391 } | |
1392 return $obj->{'_ac_generation_func'}; | |
1393 } | |
1394 | |
1395 =head2 _sv_generation_func | |
1396 | |
1397 Title : _sv_generation_func | |
1398 Usage : $obj->_sv_generation_func($newval) | |
1399 Function: | |
1400 Returns : value of _sv_generation_func | |
1401 Args : newvalue (optional) | |
1402 | |
1403 | |
1404 =cut | |
1405 | |
1406 sub _sv_generation_func{ | |
1407 my ($obj,$value) = @_; | |
1408 if( defined $value ) { | |
1409 $obj->{'_sv_generation_func'} = $value; | |
1410 } | |
1411 return $obj->{'_sv_generation_func'}; | |
1412 | |
1413 } | |
1414 | |
1415 =head2 _kw_generation_func | |
1416 | |
1417 Title : _kw_generation_func | |
1418 Usage : $obj->_kw_generation_func($newval) | |
1419 Function: | |
1420 Returns : value of _kw_generation_func | |
1421 Args : newvalue (optional) | |
1422 | |
1423 | |
1424 =cut | |
1425 | |
1426 sub _kw_generation_func{ | |
1427 my ($obj,$value) = @_; | |
1428 if( defined $value ) { | |
1429 $obj->{'_kw_generation_func'} = $value; | |
1430 } | |
1431 return $obj->{'_kw_generation_func'}; | |
1432 } | |
1433 | |
1434 1; |