Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/SeqIO/genbank.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: genbank.pm,v 1.76.2.12 2003/09/13 23:33:04 jason Exp $ | |
| 2 # | |
| 3 # BioPerl module for Bio::SeqIO::GenBank | |
| 4 # | |
| 5 # Cared for by Elia Stupka <elia@tll.org.sg> | |
| 6 # | |
| 7 # Copyright Elia Stupka | |
| 8 # | |
| 9 # You may distribute this module under the same terms as perl itself | |
| 10 | |
| 11 # POD documentation - main docs before the code | |
| 12 | |
| 13 =head1 NAME | |
| 14 | |
| 15 Bio::SeqIO::GenBank - GenBank sequence input/output stream | |
| 16 | |
| 17 =head1 SYNOPSIS | |
| 18 | |
| 19 It is probably best not to use this object directly, but | |
| 20 rather go through the SeqIO handler system. Go: | |
| 21 | |
| 22 $stream = Bio::SeqIO->new(-file => $filename, -format => 'GenBank'); | |
| 23 | |
| 24 while ( my $seq = $stream->next_seq() ) { | |
| 25 # do something with $seq | |
| 26 } | |
| 27 | |
| 28 | |
| 29 =head1 DESCRIPTION | |
| 30 | |
| 31 This object can transform Bio::Seq objects to and from GenBank flat | |
| 32 file databases. | |
| 33 | |
| 34 There is alot of flexibility here about how to dump things which I need | |
| 35 to document fully. | |
| 36 | |
| 37 =head2 Mapping of record properties to object properties | |
| 38 | |
| 39 This section is supposed to document which sections and properties of | |
| 40 a GenBank databank record end up where in the Bioperl object model. It | |
| 41 is far from complete and presently focuses only on those mappings | |
| 42 which may be non-obvious. $seq in the text refers to the | |
| 43 Bio::Seq::RichSeqI implementing object returned by the parser for each | |
| 44 record. | |
| 45 | |
| 46 =over 4 | |
| 47 | |
| 48 =item GI number | |
| 49 | |
| 50 $seq-E<gt>primary_id | |
| 51 | |
| 52 =back | |
| 53 | |
| 54 =head2 Optional functions | |
| 55 | |
| 56 =over 3 | |
| 57 | |
| 58 =item _show_dna() | |
| 59 | |
| 60 (output only) shows the dna or not | |
| 61 | |
| 62 =item _post_sort() | |
| 63 | |
| 64 (output only) provides a sorting func which is applied to the FTHelpers | |
| 65 before printing | |
| 66 | |
| 67 =item _id_generation_func() | |
| 68 | |
| 69 This is function which is called as | |
| 70 | |
| 71 print "ID ", $func($seq), "\n"; | |
| 72 | |
| 73 To generate the ID line. If it is not there, it generates a sensible ID | |
| 74 line using a number of tools. | |
| 75 | |
| 76 | |
| 77 If you want to output annotations in genbank format they need to be | |
| 78 stored in a Bio::Annotation::Collection object which is accessible | |
| 79 through the Bio::SeqI interface method L<annotation()|annotation>. | |
| 80 | |
| 81 The following are the names of the keys which are polled from a | |
| 82 L<Bio::Annotation::Collection> object. | |
| 83 | |
| 84 reference - Should contain Bio::Annotation::Reference objects | |
| 85 comment - Should contain Bio::Annotation::Comment objects | |
| 86 | |
| 87 segment - Should contain a Bio::Annotation::SimpleValue object | |
| 88 origin - Should contain a Bio::Annotation::SimpleValue object | |
| 89 | |
| 90 =back | |
| 91 | |
| 92 =head1 Where does the data go? | |
| 93 | |
| 94 Data parsed in Bio::SeqIO::genbank is stored in a variety of data | |
| 95 fields in the sequence object that is returned. More information in | |
| 96 the HOWTOs about exactly what each field means and where it goes. | |
| 97 Here is a partial list of fields. | |
| 98 | |
| 99 Items listed as RichSeq or Seq or PrimarySeq and then NAME() tell you | |
| 100 the top level object which defines a function called NAME() which | |
| 101 stores this information. | |
| 102 | |
| 103 Items listed as Annotation 'NAME' tell you the data is stored the | |
| 104 associated Bio::Annotation::Colection object which is associated with | |
| 105 Bio::Seq objects. If it is explictly requested that no annotations | |
| 106 should be stored when parsing a record of course they won't be | |
| 107 available when you try and get them. If you are having this problem | |
| 108 look at the type of SeqBuilder that is being used to contruct your | |
| 109 sequence object. | |
| 110 | |
| 111 Comments Annotation 'comment' | |
| 112 References Annotation 'reference' | |
| 113 Segment Annotation 'segment' | |
| 114 Origin Annotation 'origin' | |
| 115 | |
| 116 Accessions PrimarySeq accession_number() | |
| 117 Secondary accessions RichSeq get_secondary_accessions() | |
| 118 Keywords RichSeq keywords() | |
| 119 Dates RichSeq get_dates() | |
| 120 Molecule RichSeq molecule() | |
| 121 Seq Version RichSeq seq_version() | |
| 122 PID RichSeq pid() | |
| 123 Division RichSeq division() | |
| 124 Features Seq get_SeqFeatures() | |
| 125 Alphabet PrimarySeq alphabet() | |
| 126 Definition PrimarySeq description() or desc() | |
| 127 Version PrimarySeq version() | |
| 128 | |
| 129 Sequence PrimarySeq seq() | |
| 130 | |
| 131 =head1 FEEDBACK | |
| 132 | |
| 133 =head2 Mailing Lists | |
| 134 | |
| 135 User feedback is an integral part of the evolution of this | |
| 136 and other Bioperl modules. Send your comments and suggestions preferably | |
| 137 to one of the Bioperl mailing lists. | |
| 138 Your participation is much appreciated. | |
| 139 | |
| 140 bioperl-l@bioperl.org - General discussion | |
| 141 http://www.bioperl.org/MailList.shtml - About the mailing lists | |
| 142 | |
| 143 =head2 Reporting Bugs | |
| 144 | |
| 145 Report bugs to the Bioperl bug tracking system to help us keep track | |
| 146 the bugs and their resolution. | |
| 147 Bug reports can be submitted via email or the web: | |
| 148 | |
| 149 bioperl-bugs@bio.perl.org | |
| 150 http://bugzilla.bioperl.org/ | |
| 151 | |
| 152 =head1 AUTHOR - Elia Stupka | |
| 153 | |
| 154 Email elia@tll.org.sg | |
| 155 | |
| 156 =head1 CONTRIBUTORS | |
| 157 | |
| 158 Ewan Birney birney@ebi.ac.uk | |
| 159 Jason Stajich jason@bioperl.org | |
| 160 Chris Mungall cjm@fruitfly.bdgp.berkeley.edu | |
| 161 Lincoln Stein lstein@cshl.org | |
| 162 Heikki Lehvaslaiho, heikki@ebi.ac.uk | |
| 163 Hilmar Lapp, hlapp@gmx.net | |
| 164 Donald G. Jackson, donald.jackson@bms.com | |
| 165 | |
| 166 =head1 APPENDIX | |
| 167 | |
| 168 The rest of the documentation details each of the object | |
| 169 methods. Internal methods are usually preceded with a _ | |
| 170 | |
| 171 =cut | |
| 172 | |
| 173 # Let the code begin... | |
| 174 | |
| 175 package Bio::SeqIO::genbank; | |
| 176 use vars qw(@ISA); | |
| 177 use strict; | |
| 178 | |
| 179 use Bio::SeqIO; | |
| 180 use Bio::SeqIO::FTHelper; | |
| 181 use Bio::SeqFeature::Generic; | |
| 182 use Bio::Species; | |
| 183 use Bio::Seq::SeqFactory; | |
| 184 use Bio::Annotation::Collection; | |
| 185 use Bio::Annotation::Comment; | |
| 186 use Bio::Annotation::Reference; | |
| 187 use Bio::Annotation::DBLink; | |
| 188 | |
| 189 @ISA = qw(Bio::SeqIO); | |
| 190 | |
| 191 sub _initialize { | |
| 192 my($self,@args) = @_; | |
| 193 | |
| 194 $self->SUPER::_initialize(@args); | |
| 195 # hash for functions for decoding keys. | |
| 196 $self->{'_func_ftunit_hash'} = {}; | |
| 197 $self->_show_dna(1); # sets this to one by default. People can change it | |
| 198 if( ! defined $self->sequence_factory ) { | |
| 199 $self->sequence_factory(new Bio::Seq::SeqFactory | |
| 200 (-verbose => $self->verbose(), | |
| 201 -type => 'Bio::Seq::RichSeq')); | |
| 202 } | |
| 203 } | |
| 204 | |
| 205 =head2 next_seq | |
| 206 | |
| 207 Title : next_seq | |
| 208 Usage : $seq = $stream->next_seq() | |
| 209 Function: returns the next sequence in the stream | |
| 210 Returns : Bio::Seq object | |
| 211 Args : | |
| 212 | |
| 213 =cut | |
| 214 | |
| 215 sub next_seq { | |
| 216 my ($self,@args) = @_; | |
| 217 my $builder = $self->sequence_builder(); | |
| 218 my $seq; | |
| 219 my %params; | |
| 220 | |
| 221 RECORDSTART: while (1) { | |
| 222 my $buffer; | |
| 223 my (@acc, @features); | |
| 224 my ($display_id, $annotation); | |
| 225 my $species; | |
| 226 | |
| 227 # initialize; we may come here because of starting over | |
| 228 @features = (); | |
| 229 $annotation = undef; | |
| 230 @acc = (); | |
| 231 $species = undef; | |
| 232 %params = (-verbose => $self->verbose); # reset hash | |
| 233 local($/) = "\n"; | |
| 234 while(defined($buffer = $self->_readline())) { | |
| 235 last if index($buffer,'LOCUS ') == 0; | |
| 236 } | |
| 237 return undef if( !defined $buffer ); # end of file | |
| 238 $buffer =~ /^LOCUS\s+(\S.*)$/ || | |
| 239 $self->throw("GenBank stream with bad LOCUS line. Not GenBank in my book. Got '$buffer'"); | |
| 240 | |
| 241 my @tokens = split(' ', $1); | |
| 242 | |
| 243 # this is important to have the id for display in e.g. FTHelper, | |
| 244 # otherwise you won't know which entry caused an error | |
| 245 $display_id = shift(@tokens); | |
| 246 $params{'-display_id'} = $display_id; | |
| 247 # may still be useful if we don't want the seq | |
| 248 $params{'-length'} = shift(@tokens); | |
| 249 # the alphabet of the entry | |
| 250 $params{'-alphabet'} = (lc(shift @tokens) eq 'bp') ? 'dna' : 'protein'; | |
| 251 # for aa there is usually no 'molecule' (mRNA etc) | |
| 252 if (($params{'-alphabet'} eq 'dna') || (@tokens > 2)) { | |
| 253 $params{'-molecule'} = shift(@tokens); | |
| 254 my $circ = shift(@tokens); | |
| 255 if ($circ eq 'circular') { | |
| 256 $params{'-is_circular'} = 1; | |
| 257 $params{'-division'} = shift(@tokens); | |
| 258 } else { | |
| 259 # 'linear' or 'circular' may actually be omitted altogether | |
| 260 $params{'-division'} = | |
| 261 (CORE::length($circ) == 3 ) ? $circ : shift(@tokens); | |
| 262 } | |
| 263 } else { | |
| 264 $params{'-molecule'} = 'PRT' if($params{'-alphabet'} eq 'aa'); | |
| 265 $params{'-division'} = shift(@tokens); | |
| 266 } | |
| 267 my $date = join(' ', @tokens); # we lump together the rest | |
| 268 # this is per request bug #1513 | |
| 269 # we can handle | |
| 270 # 9-10-2003 | |
| 271 # 9-10-03 | |
| 272 #09-10-2003 | |
| 273 #09-10-03 | |
| 274 if($date =~ s/\s*((\d{1,2})-(\w{3})-(\d{2,4})).*/$1/) { | |
| 275 if( length($date) < 11 ) { # improperly formatted date | |
| 276 # But we'll be nice and fix it for them | |
| 277 my ($d,$m,$y) = ($2,$3,$4); | |
| 278 if( length($d) == 1 ) { | |
| 279 $d = "0$d"; | |
| 280 } | |
| 281 # guess the century here | |
| 282 if( length($y) == 2 ) { | |
| 283 if( $y > 60 ) { # arbitrarily guess that '60' means 1960 | |
| 284 $y = "19$y"; | |
| 285 } else { | |
| 286 $y = "20$y"; | |
| 287 } | |
| 288 $self->warn("Date was malformed, guessing the century for $date to be $y\n"); | |
| 289 } | |
| 290 $params{'-dates'} = [join('-',$d,$m,$y)]; | |
| 291 } else { | |
| 292 $params{'-dates'} = [$date]; | |
| 293 } | |
| 294 } | |
| 295 | |
| 296 # set them all at once | |
| 297 $builder->add_slot_value(%params); | |
| 298 %params = (); | |
| 299 | |
| 300 # parse the rest if desired, otherwise start over | |
| 301 if(! $builder->want_object()) { | |
| 302 $builder->make_object(); | |
| 303 next RECORDSTART; | |
| 304 } | |
| 305 | |
| 306 # set up annotation depending on what the builder wants | |
| 307 if($builder->want_slot('annotation')) { | |
| 308 $annotation = new Bio::Annotation::Collection; | |
| 309 } | |
| 310 $buffer = $self->_readline(); | |
| 311 until( !defined ($buffer) ) { | |
| 312 $_ = $buffer; | |
| 313 | |
| 314 # Description line(s) | |
| 315 if (/^DEFINITION\s+(\S.*\S)/) { | |
| 316 my @desc = ($1); | |
| 317 while ( defined($_ = $self->_readline) ) { | |
| 318 if( /^\s+(.*)/ ) { push (@desc, $1); next }; | |
| 319 last; | |
| 320 } | |
| 321 $builder->add_slot_value(-desc => join(' ', @desc)); | |
| 322 # we'll continue right here because DEFINITION always comes | |
| 323 # at the top of the entry | |
| 324 } | |
| 325 # accession number (there can be multiple accessions) | |
| 326 if( /^ACCESSION\s+(\S.*\S)/ ) { | |
| 327 push(@acc, split(/\s+/,$1)); | |
| 328 while( defined($_ = $self->_readline) ) { | |
| 329 /^\s+(.*)/ && do { push (@acc, split(/\s+/,$1)); next }; | |
| 330 last; | |
| 331 } | |
| 332 $buffer = $_; | |
| 333 next; | |
| 334 } | |
| 335 # PID | |
| 336 elsif( /^PID\s+(\S+)/ ) { | |
| 337 $params{'-pid'} = $1; | |
| 338 } | |
| 339 #Version number | |
| 340 elsif( /^VERSION\s+(.+)$/ ) { | |
| 341 my ($acc,$gi) = split(' ',$1); | |
| 342 if($acc =~ /^\w+\.(\d+)/) { | |
| 343 $params{'-version'} = $1; | |
| 344 $params{'-seq_version'} = $1; | |
| 345 } | |
| 346 if($gi && (index($gi,"GI:") == 0)) { | |
| 347 $params{'-primary_id'} = substr($gi,3); | |
| 348 } | |
| 349 } | |
| 350 #Keywords | |
| 351 elsif( /^KEYWORDS\s+(.*)/ ) { | |
| 352 my @kw = split(/\s*\;\s*/,$1); | |
| 353 while( defined($_ = $self->_readline) ) { | |
| 354 chomp; | |
| 355 /^\s+(.*)/ && do { push (@kw, split(/\s*\;\s*/,$1)); next }; | |
| 356 last; | |
| 357 } | |
| 358 | |
| 359 @kw && $kw[-1] =~ s/\.$//; | |
| 360 $params{'-keywords'} = \@kw; | |
| 361 $buffer = $_; | |
| 362 next; | |
| 363 } | |
| 364 # Organism name and phylogenetic information | |
| 365 elsif (/^SOURCE/) { | |
| 366 if($builder->want_slot('species')) { | |
| 367 $species = $self->_read_GenBank_Species(\$buffer); | |
| 368 $builder->add_slot_value(-species => $species); | |
| 369 } else { | |
| 370 while(defined($buffer = $self->_readline())) { | |
| 371 last if substr($buffer,0,1) ne ' '; | |
| 372 } | |
| 373 } | |
| 374 next; | |
| 375 } | |
| 376 #References | |
| 377 elsif (/^REFERENCE/) { | |
| 378 if($annotation) { | |
| 379 my @refs = $self->_read_GenBank_References(\$buffer); | |
| 380 foreach my $ref ( @refs ) { | |
| 381 $annotation->add_Annotation('reference',$ref); | |
| 382 } | |
| 383 } else { | |
| 384 while(defined($buffer = $self->_readline())) { | |
| 385 last if substr($buffer,0,1) ne ' '; | |
| 386 } | |
| 387 } | |
| 388 next; | |
| 389 } | |
| 390 #Comments | |
| 391 elsif (/^COMMENT\s+(.*)/) { | |
| 392 if($annotation) { | |
| 393 my $comment = $1; | |
| 394 while (defined($_ = $self->_readline)) { | |
| 395 last if (/^\S/); | |
| 396 $comment .= $_; | |
| 397 } | |
| 398 $comment =~ s/\n/ /g; | |
| 399 $comment =~ s/ +/ /g; | |
| 400 $annotation->add_Annotation( | |
| 401 'comment', | |
| 402 Bio::Annotation::Comment->new(-text => $comment)); | |
| 403 $buffer = $_; | |
| 404 } else { | |
| 405 while(defined($buffer = $self->_readline())) { | |
| 406 last if substr($buffer,0,1) ne ' '; | |
| 407 } | |
| 408 } | |
| 409 next; | |
| 410 } elsif( /^SEGMENT\s+(.+)/ ) { | |
| 411 if($annotation) { | |
| 412 my $segment = $1; | |
| 413 while (defined($_ = $self->_readline)) { | |
| 414 last if (/^\S/); | |
| 415 $segment .= $_; | |
| 416 } | |
| 417 $segment =~ s/\n/ /g; | |
| 418 $segment =~ s/ +/ /g; | |
| 419 $annotation->add_Annotation( | |
| 420 'segment', | |
| 421 Bio::Annotation::SimpleValue->new(-value => $segment)); | |
| 422 $buffer = $_; | |
| 423 } else { | |
| 424 while(defined($buffer = $self->_readline())) { | |
| 425 last if substr($buffer,0,1) ne ' '; | |
| 426 } | |
| 427 } | |
| 428 next; | |
| 429 } | |
| 430 | |
| 431 # Exit at start of Feature table, or start of sequence | |
| 432 last if( /^(FEATURES|ORIGIN)/ ); | |
| 433 # Get next line and loop again | |
| 434 $buffer = $self->_readline; | |
| 435 } | |
| 436 return undef if(! defined($buffer)); | |
| 437 | |
| 438 # add them all at once for efficiency | |
| 439 $builder->add_slot_value(-accession_number => shift(@acc), | |
| 440 -secondary_accessions => \@acc, | |
| 441 %params); | |
| 442 $builder->add_slot_value(-annotation => $annotation) if $annotation; | |
| 443 %params = (); # reset before possible re-use to avoid setting twice | |
| 444 | |
| 445 # start over if we don't want to continue with this entry | |
| 446 if(! $builder->want_object()) { | |
| 447 $builder->make_object(); | |
| 448 next RECORDSTART; | |
| 449 } | |
| 450 | |
| 451 # some "minimal" formats may not necessarily have a feature table | |
| 452 if($builder->want_slot('features') && defined($_) && /^FEATURES/) { | |
| 453 # need to read the first line of the feature table | |
| 454 $buffer = $self->_readline; | |
| 455 | |
| 456 # DO NOT read lines in the while condition -- this is done as a side | |
| 457 # effect in _read_FTHelper_GenBank! | |
| 458 while( defined($buffer) ) { | |
| 459 # check immediately -- not at the end of the loop | |
| 460 # note: GenPept entries obviously do not have a BASE line | |
| 461 last if(($buffer =~ /^BASE/) || ($buffer =~ /^ORIGIN/) || | |
| 462 ($buffer =~ /^CONTIG/) ); | |
| 463 | |
| 464 # slurp in one feature at a time -- at return, the start of | |
| 465 # the next feature will have been read already, so we need | |
| 466 # to pass a reference, and the called method must set this | |
| 467 # to the last line read before returning | |
| 468 | |
| 469 my $ftunit = $self->_read_FTHelper_GenBank(\$buffer); | |
| 470 | |
| 471 # fix suggested by James Diggans | |
| 472 | |
| 473 if( !defined $ftunit ) { | |
| 474 # GRRRR. We have fallen over. Try to recover | |
| 475 $self->warn("Unexpected error in feature table for ".$params{'-display_id'}." Skipping feature, attempting to recover"); | |
| 476 unless( ($buffer =~ /^\s{5,5}\S+/) or ($buffer =~ /^\S+/)) { | |
| 477 $buffer = $self->_readline(); | |
| 478 } | |
| 479 next; # back to reading FTHelpers | |
| 480 } | |
| 481 | |
| 482 # process ftunit | |
| 483 my $feat = | |
| 484 $ftunit->_generic_seqfeature($self->location_factory(), | |
| 485 $display_id); | |
| 486 # add taxon_id from source if available | |
| 487 if($species && ($feat->primary_tag eq 'source') && | |
| 488 $feat->has_tag('db_xref') && (! $species->ncbi_taxid())) { | |
| 489 foreach my $tagval ($feat->get_tag_values('db_xref')) { | |
| 490 if(index($tagval,"taxon:") == 0) { | |
| 491 $species->ncbi_taxid(substr($tagval,6)); | |
| 492 } | |
| 493 } | |
| 494 } | |
| 495 # add feature to list of features | |
| 496 push(@features, $feat); | |
| 497 } | |
| 498 $builder->add_slot_value(-features => \@features); | |
| 499 $_ = $buffer; | |
| 500 } | |
| 501 if( defined ($_) ) { | |
| 502 if( /^CONTIG/ && $builder->want_slot('features')) { | |
| 503 $b = " $_"; # need 5 spaces to treat it like a feature | |
| 504 my $ftunit = $self->_read_FTHelper_GenBank(\$b); | |
| 505 if( ! defined $ftunit ) { | |
| 506 $self->warn("unable to parse the CONTIG feature\n"); | |
| 507 } else { | |
| 508 push(@features, | |
| 509 $ftunit->_generic_seqfeature($self->location_factory(), | |
| 510 $display_id)); | |
| 511 } | |
| 512 } elsif(! /^(ORIGIN|\/\/)/ ) { # advance to the sequence, if any | |
| 513 while (defined( $_ = $self->_readline) ) { | |
| 514 last if /^(ORIGIN|\/\/)/; | |
| 515 } | |
| 516 } | |
| 517 } | |
| 518 if(! $builder->want_object()) { | |
| 519 $builder->make_object(); # implicit end-of-object | |
| 520 next RECORDSTART; | |
| 521 } | |
| 522 if($builder->want_slot('seq')) { | |
| 523 # the fact that we want a sequence does not necessarily mean that | |
| 524 # there also is a sequence ... | |
| 525 if(defined($_) && s/^ORIGIN//) { | |
| 526 chomp; | |
| 527 if( $annotation && length($_) > 0 ) { | |
| 528 $annotation->add_Annotation('origin', | |
| 529 Bio::Annotation::SimpleValue->new(-value => $_)); | |
| 530 } | |
| 531 my $seqc = ''; | |
| 532 while( defined($_ = $self->_readline) ) { | |
| 533 /^\/\// && last; | |
| 534 $_ = uc($_); | |
| 535 s/[^A-Za-z]//g; | |
| 536 $seqc .= $_; | |
| 537 } | |
| 538 $self->debug("sequence length is ". length($seqc) ."\n"); | |
| 539 $builder->add_slot_value(-seq => $seqc); | |
| 540 } | |
| 541 } elsif ( defined($_) && (substr($_,0,2) ne '//')) { | |
| 542 # advance to the end of the record | |
| 543 while( defined($_ = $self->_readline) ) { | |
| 544 last if substr($_,0,2) eq '//'; | |
| 545 } | |
| 546 } | |
| 547 # Unlikely, but maybe the sequence is so weird that we don't want it | |
| 548 # anymore. We don't want to return undef if the stream's not exhausted | |
| 549 # yet. | |
| 550 $seq = $builder->make_object(); | |
| 551 next RECORDSTART unless $seq; | |
| 552 last RECORDSTART; | |
| 553 } # end while RECORDSTART | |
| 554 | |
| 555 return $seq; | |
| 556 } | |
| 557 | |
| 558 =head2 write_seq | |
| 559 | |
| 560 Title : write_seq | |
| 561 Usage : $stream->write_seq($seq) | |
| 562 Function: writes the $seq object (must be seq) to the stream | |
| 563 Returns : 1 for success and 0 for error | |
| 564 Args : array of 1 to n Bio::SeqI objects | |
| 565 | |
| 566 | |
| 567 =cut | |
| 568 | |
| 569 sub write_seq { | |
| 570 my ($self,@seqs) = @_; | |
| 571 | |
| 572 foreach my $seq ( @seqs ) { | |
| 573 $self->throw("Attempting to write with no seq!") unless defined $seq; | |
| 574 | |
| 575 if( ! ref $seq || ! $seq->isa('Bio::SeqI') ) { | |
| 576 $self->warn(" $seq is not a SeqI compliant module. Attempting to dump, but may fail!"); | |
| 577 } | |
| 578 | |
| 579 my $str = $seq->seq; | |
| 580 | |
| 581 my ($div, $mol); | |
| 582 my $len = $seq->length(); | |
| 583 | |
| 584 if ( $seq->can('division') ) { | |
| 585 $div=$seq->division; | |
| 586 } | |
| 587 if( !defined $div || ! $div ) { $div = 'UNK'; } | |
| 588 my $alpha = $seq->alphabet; | |
| 589 if( !$seq->can('molecule') || ! defined ($mol = $seq->molecule()) ) { | |
| 590 $mol = $alpha || 'DNA'; | |
| 591 } | |
| 592 | |
| 593 my $circular = 'linear '; | |
| 594 $circular = 'circular' if $seq->is_circular; | |
| 595 | |
| 596 local($^W) = 0; # supressing warnings about uninitialized fields. | |
| 597 | |
| 598 my $temp_line; | |
| 599 if( $self->_id_generation_func ) { | |
| 600 $temp_line = &{$self->_id_generation_func}($seq); | |
| 601 } else { | |
| 602 my $date = ''; | |
| 603 if( $seq->can('get_dates') ) { | |
| 604 ($date) = $seq->get_dates(); | |
| 605 } | |
| 606 $temp_line = sprintf ("%-12s%-15s%13s %s%4s%-8s%-8s %3s %-s", | |
| 607 'LOCUS', $seq->id(),$len, | |
| 608 (lc($alpha) eq 'protein') ? ('aa','', '') : | |
| 609 ('bp', '',$mol),$circular, | |
| 610 $div,$date); | |
| 611 } | |
| 612 | |
| 613 $self->_print("$temp_line\n"); | |
| 614 $self->_write_line_GenBank_regex("DEFINITION ", " ", | |
| 615 $seq->desc(),"\\s\+\|\$",80); | |
| 616 | |
| 617 # if there, write the accession line | |
| 618 | |
| 619 if( $self->_ac_generation_func ) { | |
| 620 $temp_line = &{$self->_ac_generation_func}($seq); | |
| 621 $self->_print("ACCESSION $temp_line\n"); | |
| 622 } else { | |
| 623 my @acc = (); | |
| 624 push(@acc, $seq->accession_number()); | |
| 625 if( $seq->isa('Bio::Seq::RichSeqI') ) { | |
| 626 push(@acc, $seq->get_secondary_accessions()); | |
| 627 } | |
| 628 $self->_print("ACCESSION ", join(" ", @acc), "\n"); | |
| 629 # otherwise - cannot print <sigh> | |
| 630 } | |
| 631 | |
| 632 # if PID defined, print it | |
| 633 if($seq->isa('Bio::Seq::RichSeqI') && $seq->pid()) { | |
| 634 $self->_print("PID ", $seq->pid(), "\n"); | |
| 635 } | |
| 636 | |
| 637 # if there, write the version line | |
| 638 | |
| 639 if( defined $self->_sv_generation_func() ) { | |
| 640 $temp_line = &{$self->_sv_generation_func}($seq); | |
| 641 if( $temp_line ) { | |
| 642 $self->_print("VERSION $temp_line\n"); | |
| 643 } | |
| 644 } else { | |
| 645 if($seq->isa('Bio::Seq::RichSeqI') && defined($seq->seq_version)) { | |
| 646 my $id = $seq->primary_id(); # this may be a GI number | |
| 647 $self->_print("VERSION ", | |
| 648 $seq->accession_number(), ".", $seq->seq_version, | |
| 649 ($id && ($id =~ /^\d+$/) ? " GI:".$id : ""), | |
| 650 "\n"); | |
| 651 } | |
| 652 } | |
| 653 | |
| 654 # if there, write the keywords line | |
| 655 | |
| 656 if( defined $self->_kw_generation_func() ) { | |
| 657 $temp_line = &{$self->_kw_generation_func}($seq); | |
| 658 $self->_print("KEYWORDS $temp_line\n"); | |
| 659 } else { | |
| 660 if( $seq->can('keywords') ) { | |
| 661 my $kw = $seq->keywords; | |
| 662 if( ref($kw) =~ /ARRAY/i ) { | |
| 663 $kw = join("; ", @$kw); | |
| 664 } | |
| 665 $kw .= '.' if( $kw !~ /\.$/ ); | |
| 666 $self->_print("KEYWORDS $kw\n"); | |
| 667 } | |
| 668 } | |
| 669 | |
| 670 # SEGMENT if it exists | |
| 671 foreach my $ref ( $seq->annotation->get_Annotations('segment') ) { | |
| 672 $self->_print(sprintf ("%-11s %s\n",'SEGMENT', | |
| 673 $ref->value)); | |
| 674 } | |
| 675 | |
| 676 # Organism lines | |
| 677 if (my $spec = $seq->species) { | |
| 678 my ($species, $genus, @class) = $spec->classification(); | |
| 679 my $OS; | |
| 680 if( $spec->common_name ) { | |
| 681 $OS = $spec->common_name; | |
| 682 } else { | |
| 683 $OS = "$genus $species"; | |
| 684 } | |
| 685 if (my $ssp = $spec->sub_species) { | |
| 686 $OS .= " $ssp"; | |
| 687 } | |
| 688 $self->_print("SOURCE $OS\n"); | |
| 689 $self->_print(" ORGANISM ", | |
| 690 ($spec->organelle() ? $spec->organelle()." " : ""), | |
| 691 "$genus $species", "\n"); | |
| 692 my $OC = join('; ', (reverse(@class), $genus)) .'.'; | |
| 693 $self->_write_line_GenBank_regex(' 'x12,' 'x12, | |
| 694 $OC,"\\s\+\|\$",80); | |
| 695 } | |
| 696 | |
| 697 # Reference lines | |
| 698 my $count = 1; | |
| 699 foreach my $ref ( $seq->annotation->get_Annotations('reference') ) { | |
| 700 $temp_line = sprintf ("REFERENCE $count (%s %d to %d)", | |
| 701 ($seq->alphabet() eq "protein" ? | |
| 702 "residues" : "bases"), | |
| 703 $ref->start,$ref->end); | |
| 704 $self->_print("$temp_line\n"); | |
| 705 $self->_write_line_GenBank_regex(" AUTHORS ",' 'x12, | |
| 706 $ref->authors,"\\s\+\|\$",80); | |
| 707 $self->_write_line_GenBank_regex(" TITLE "," "x12, | |
| 708 $ref->title,"\\s\+\|\$",80); | |
| 709 $self->_write_line_GenBank_regex(" JOURNAL "," "x12, | |
| 710 $ref->location,"\\s\+\|\$",80); | |
| 711 if ($ref->comment) { | |
| 712 $self->_write_line_GenBank_regex(" REMARK "," "x12, | |
| 713 $ref->comment,"\\s\+\|\$",80); | |
| 714 } | |
| 715 if( $ref->medline) { | |
| 716 $self->_write_line_GenBank_regex(" MEDLINE "," "x12, | |
| 717 $ref->medline, "\\s\+\|\$",80); | |
| 718 # I am assuming that pubmed entries only exist when there | |
| 719 # are also MEDLINE entries due to the indentation | |
| 720 # This could be a wrong assumption | |
| 721 if( $ref->pubmed ) { | |
| 722 $self->_write_line_GenBank_regex(" PUBMED "," "x12, | |
| 723 $ref->pubmed, "\\s\+\|\$", | |
| 724 80); | |
| 725 } | |
| 726 } | |
| 727 $count++; | |
| 728 } | |
| 729 # Comment lines | |
| 730 | |
| 731 foreach my $comment ( $seq->annotation->get_Annotations('comment') ) { | |
| 732 $self->_write_line_GenBank_regex("COMMENT "," "x12, | |
| 733 $comment->text,"\\s\+\|\$",80); | |
| 734 } | |
| 735 $self->_print("FEATURES Location/Qualifiers\n"); | |
| 736 | |
| 737 my $contig; | |
| 738 if( defined $self->_post_sort ) { | |
| 739 # we need to read things into an array. Process. Sort them. Print 'em | |
| 740 | |
| 741 my $post_sort_func = $self->_post_sort(); | |
| 742 my @fth; | |
| 743 | |
| 744 foreach my $sf ( $seq->top_SeqFeatures ) { | |
| 745 push(@fth,Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq)); | |
| 746 } | |
| 747 | |
| 748 @fth = sort { &$post_sort_func($a,$b) } @fth; | |
| 749 | |
| 750 foreach my $fth ( @fth ) { | |
| 751 $self->_print_GenBank_FTHelper($fth); | |
| 752 } | |
| 753 } else { | |
| 754 # not post sorted. And so we can print as we get them. | |
| 755 # lower memory load... | |
| 756 | |
| 757 foreach my $sf ( $seq->top_SeqFeatures ) { | |
| 758 my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq); | |
| 759 foreach my $fth ( @fth ) { | |
| 760 if( ! $fth->isa('Bio::SeqIO::FTHelper') ) { | |
| 761 $sf->throw("Cannot process FTHelper... $fth"); | |
| 762 } | |
| 763 $self->_print_GenBank_FTHelper($fth); | |
| 764 } | |
| 765 } | |
| 766 } | |
| 767 if( $seq->length == 0 ) { $self->_show_dna(0) } | |
| 768 | |
| 769 if( $self->_show_dna() == 0 ) { | |
| 770 $self->_print("\n//\n"); | |
| 771 return; | |
| 772 } | |
| 773 | |
| 774 # finished printing features. | |
| 775 | |
| 776 $str =~ tr/A-Z/a-z/; | |
| 777 | |
| 778 # Count each nucleotide | |
| 779 unless( $mol eq 'protein' ) { | |
| 780 my $alen = $str =~ tr/a/a/; | |
| 781 my $clen = $str =~ tr/c/c/; | |
| 782 my $glen = $str =~ tr/g/g/; | |
| 783 my $tlen = $str =~ tr/t/t/; | |
| 784 | |
| 785 my $olen = $len - ($alen + $tlen + $clen + $glen); | |
| 786 if( $olen < 0 ) { | |
| 787 $self->warn("Weird. More atgc than bases. Problem!"); | |
| 788 } | |
| 789 | |
| 790 my $base_count = sprintf("BASE COUNT %8s a %6s c %6s g %6s t%s\n", | |
| 791 $alen,$clen,$glen,$tlen, | |
| 792 ( $olen > 0 ) ? sprintf("%6s others",$olen) : ''); | |
| 793 $self->_print($base_count); | |
| 794 } | |
| 795 my ($o) = $seq->annotation->get_Annotations('origin'); | |
| 796 $self->_print(sprintf("%-6s%s\n",'ORIGIN',$o ? $o->value : '')); | |
| 797 | |
| 798 # print out the sequence | |
| 799 my $nuc = 60; # Number of nucleotides per line | |
| 800 my $whole_pat = 'a10' x 6; # Pattern for unpacking a whole line | |
| 801 my $out_pat = 'A11' x 6; # Pattern for packing a line | |
| 802 my $length = length($str); | |
| 803 | |
| 804 # Calculate the number of nucleotides which fit on whole lines | |
| 805 my $whole = int($length / $nuc) * $nuc; | |
| 806 | |
| 807 # Print the whole lines | |
| 808 my $i; | |
| 809 for ($i = 0; $i < $whole; $i += $nuc) { | |
| 810 my $blocks = pack $out_pat, | |
| 811 unpack $whole_pat, | |
| 812 substr($str, $i, $nuc); | |
| 813 chop $blocks; | |
| 814 $self->_print(sprintf("%9d $blocks\n", $i + $nuc - 59)); | |
| 815 } | |
| 816 | |
| 817 # Print the last line | |
| 818 if (my $last = substr($str, $i)) { | |
| 819 my $last_len = length($last); | |
| 820 my $last_pat = 'a10' x int($last_len / 10) .'a'. $last_len % 10; | |
| 821 my $blocks = pack $out_pat, | |
| 822 unpack($last_pat, $last); | |
| 823 $blocks =~ s/ +$//; | |
| 824 $self->_print(sprintf("%9d $blocks\n", $length - $last_len + 1)); | |
| 825 } | |
| 826 | |
| 827 $self->_print("//\n"); | |
| 828 | |
| 829 $self->flush if $self->_flush_on_write && defined $self->_fh; | |
| 830 return 1; | |
| 831 } | |
| 832 } | |
| 833 | |
| 834 =head2 _print_GenBank_FTHelper | |
| 835 | |
| 836 Title : _print_GenBank_FTHelper | |
| 837 Usage : | |
| 838 Function: | |
| 839 Example : | |
| 840 Returns : | |
| 841 Args : | |
| 842 | |
| 843 | |
| 844 =cut | |
| 845 | |
| 846 sub _print_GenBank_FTHelper { | |
| 847 my ($self,$fth,$always_quote) = @_; | |
| 848 | |
| 849 if( ! ref $fth || ! $fth->isa('Bio::SeqIO::FTHelper') ) { | |
| 850 $fth->warn("$fth is not a FTHelper class. Attempting to print, but there could be tears!"); | |
| 851 } | |
| 852 if( defined $fth->key && | |
| 853 $fth->key eq 'CONTIG' ) { | |
| 854 $self->_write_line_GenBank_regex(sprintf("%-12s",$fth->key), | |
| 855 ' 'x12,$fth->loc,"\,\|\$",80); | |
| 856 } else { | |
| 857 $self->_write_line_GenBank_regex(sprintf(" %-16s",$fth->key), | |
| 858 " "x21, | |
| 859 $fth->loc,"\,\|\$",80); | |
| 860 } | |
| 861 | |
| 862 if( !defined $always_quote) { $always_quote = 0; } | |
| 863 | |
| 864 foreach my $tag ( keys %{$fth->field} ) { | |
| 865 foreach my $value ( @{$fth->field->{$tag}} ) { | |
| 866 $value =~ s/\"/\"\"/g; | |
| 867 if ($value eq "_no_value") { | |
| 868 $self->_write_line_GenBank_regex(" "x21, | |
| 869 " "x21, | |
| 870 "/$tag","\.\|\$",80); | |
| 871 } | |
| 872 elsif( $always_quote == 1 || $value !~ /^\d+$/ ) { | |
| 873 my ($pat) = ($value =~ /\s/ ? '\s|$' : '.|$'); | |
| 874 $self->_write_line_GenBank_regex(" "x21, | |
| 875 " "x21, | |
| 876 "/$tag=\"$value\"",$pat,80); | |
| 877 } else { | |
| 878 $self->_write_line_GenBank_regex(" "x21, | |
| 879 " "x21, | |
| 880 "/$tag=$value","\.\|\$",80); | |
| 881 } | |
| 882 } | |
| 883 } | |
| 884 | |
| 885 } | |
| 886 | |
| 887 | |
| 888 =head2 _read_GenBank_References | |
| 889 | |
| 890 Title : _read_GenBank_References | |
| 891 Usage : | |
| 892 Function: Reads references from GenBank format. Internal function really | |
| 893 Returns : | |
| 894 Args : | |
| 895 | |
| 896 | |
| 897 =cut | |
| 898 | |
| 899 sub _read_GenBank_References{ | |
| 900 my ($self,$buffer) = @_; | |
| 901 my (@refs); | |
| 902 my $ref; | |
| 903 | |
| 904 # assumme things are starting with RN | |
| 905 | |
| 906 if( $$buffer !~ /^REFERENCE/ ) { | |
| 907 warn("Not parsing line '$$buffer' which maybe important"); | |
| 908 } | |
| 909 | |
| 910 $_ = $$buffer; | |
| 911 | |
| 912 my (@title,@loc,@authors,@com,@medline,@pubmed); | |
| 913 | |
| 914 REFLOOP: while( defined($_) || defined($_ = $self->_readline) ) { | |
| 915 if (/^ AUTHORS\s+(.*)/) { | |
| 916 push (@authors, $1); | |
| 917 while ( defined($_ = $self->_readline) ) { | |
| 918 /^\s{3,}(.*)/ && do { push (@authors, $1);next;}; | |
| 919 last; | |
| 920 } | |
| 921 $ref->authors(join(' ', @authors)); | |
| 922 } | |
| 923 if (/^ TITLE\s+(.*)/) { | |
| 924 push (@title, $1); | |
| 925 while ( defined($_ = $self->_readline) ) { | |
| 926 /^\s{3,}(.*)/ && do { push (@title, $1); | |
| 927 next; | |
| 928 }; | |
| 929 last; | |
| 930 } | |
| 931 $ref->title(join(' ', @title)); | |
| 932 } | |
| 933 if (/^ JOURNAL\s+(.*)/) { | |
| 934 push(@loc, $1); | |
| 935 while ( defined($_ = $self->_readline) ) { | |
| 936 /^\s{3,}(.*)/ && do { push(@loc, $1); | |
| 937 next; | |
| 938 }; | |
| 939 last; | |
| 940 } | |
| 941 $ref->location(join(' ', @loc)); | |
| 942 redo REFLOOP; | |
| 943 } | |
| 944 if (/^ REMARK\s+(.*)/) { | |
| 945 push (@com, $1); | |
| 946 while ( defined($_ = $self->_readline) ) { | |
| 947 /^\s{3,}(.*)/ && do { push(@com, $1); | |
| 948 next; | |
| 949 }; | |
| 950 last; | |
| 951 } | |
| 952 $ref->comment(join(' ', @com)); | |
| 953 redo REFLOOP; | |
| 954 } | |
| 955 if( /^ MEDLINE\s+(.*)/ ) { | |
| 956 push(@medline,$1); | |
| 957 while ( defined($_ = $self->_readline) ) { | |
| 958 /^\s{4,}(.*)/ && do { push(@medline, $1); | |
| 959 next; | |
| 960 }; | |
| 961 last; | |
| 962 } | |
| 963 $ref->medline(join(' ', @medline)); | |
| 964 redo REFLOOP; | |
| 965 } | |
| 966 if( /^ PUBMED\s+(.*)/ ) { | |
| 967 push(@pubmed,$1); | |
| 968 while ( defined($_ = $self->_readline) ) { | |
| 969 /^\s{5,}(.*)/ && do { push(@pubmed, $1); | |
| 970 next; | |
| 971 }; | |
| 972 last; | |
| 973 } | |
| 974 $ref->pubmed(join(' ', @pubmed)); | |
| 975 redo REFLOOP; | |
| 976 } | |
| 977 | |
| 978 /^REFERENCE/ && do { | |
| 979 # store current reference | |
| 980 $self->_add_ref_to_array(\@refs,$ref) if $ref; | |
| 981 # reset | |
| 982 @authors = (); | |
| 983 @title = (); | |
| 984 @loc = (); | |
| 985 @com = (); | |
| 986 @pubmed = (); | |
| 987 @medline = (); | |
| 988 # create the new reference object | |
| 989 $ref = Bio::Annotation::Reference->new(); | |
| 990 # check whether start and end base is given | |
| 991 if (/^REFERENCE\s+\d+\s+\([a-z]+ (\d+) to (\d+)/){ | |
| 992 $ref->start($1); | |
| 993 $ref->end($2); | |
| 994 } | |
| 995 }; | |
| 996 | |
| 997 /^(FEATURES)|(COMMENT)/ && last; | |
| 998 | |
| 999 $_ = undef; # Empty $_ to trigger read of next line | |
| 1000 } | |
| 1001 | |
| 1002 # store last reference | |
| 1003 $self->_add_ref_to_array(\@refs,$ref) if $ref; | |
| 1004 | |
| 1005 $$buffer = $_; | |
| 1006 | |
| 1007 #print "\nnumber of references found: ", $#refs+1,"\n"; | |
| 1008 | |
| 1009 return @refs; | |
| 1010 } | |
| 1011 | |
| 1012 # | |
| 1013 # This is undocumented as it shouldn't be called by anywhere else as | |
| 1014 # read_GenBank_References. For those who still want to know: | |
| 1015 # | |
| 1016 # Purpose: adds a Reference object to an array of Reference objects, takes | |
| 1017 # care of possible cleanups to be done (currently, only author and title | |
| 1018 # will be chopped of trailing semicolons). | |
| 1019 # Parameters: | |
| 1020 # a reference to an array of Reference objects | |
| 1021 # the Reference object to be added | |
| 1022 # Returns: nothing | |
| 1023 # | |
| 1024 sub _add_ref_to_array { | |
| 1025 my ($self, $refs, $ref) = @_; | |
| 1026 | |
| 1027 # first, polish author and title by removing possible trailing semicolons | |
| 1028 my $au = $ref->authors(); | |
| 1029 my $title = $ref->title(); | |
| 1030 $au =~ s/;\s*$//g if $au; | |
| 1031 $title =~ s/;\s*$//g if $title; | |
| 1032 $ref->authors($au); | |
| 1033 $ref->title($title); | |
| 1034 # the rest should be clean already, so go ahead and add it | |
| 1035 push(@{$refs}, $ref); | |
| 1036 } | |
| 1037 | |
| 1038 =head2 _read_GenBank_Species | |
| 1039 | |
| 1040 Title : _read_GenBank_Species | |
| 1041 Usage : | |
| 1042 Function: Reads the GenBank Organism species and classification | |
| 1043 lines. | |
| 1044 Example : | |
| 1045 Returns : A Bio::Species object | |
| 1046 Args : a reference to the current line buffer | |
| 1047 | |
| 1048 =cut | |
| 1049 | |
| 1050 sub _read_GenBank_Species { | |
| 1051 my( $self,$buffer) = @_; | |
| 1052 my @organell_names = ("chloroplast", "mitochondr"); | |
| 1053 # only those carrying DNA, apart from the nucleus | |
| 1054 | |
| 1055 $_ = $$buffer; | |
| 1056 | |
| 1057 my( $sub_species, $species, $genus, $common, $organelle, @class ); | |
| 1058 # upon first entering the loop, we must not read a new line -- the SOURCE | |
| 1059 # line is already in the buffer (HL 05/10/2000) | |
| 1060 while (defined($_) || defined($_ = $self->_readline())) { | |
| 1061 # de-HTMLify (links that may be encountered here don't contain | |
| 1062 # escaped '>', so a simple-minded approach suffices) | |
| 1063 s/<[^>]+>//g; | |
| 1064 if (/^SOURCE\s+(.*)/) { | |
| 1065 # FIXME this is probably mostly wrong (e.g., it yields things like | |
| 1066 # Homo sapiens adult placenta cDNA to mRNA | |
| 1067 # which is certainly not what you want) | |
| 1068 $common = $1; | |
| 1069 $common =~ s/\.$//; # remove trailing dot | |
| 1070 } elsif (/^\s+ORGANISM/) { | |
| 1071 my @spflds = split(' ', $_); | |
| 1072 shift(@spflds); # ORGANISM | |
| 1073 if(grep { $_ =~ /^$spflds[0]/i; } @organell_names) { | |
| 1074 $organelle = shift(@spflds); | |
| 1075 } | |
| 1076 $genus = shift(@spflds); | |
| 1077 if(@spflds) { | |
| 1078 $species = shift(@spflds); | |
| 1079 } else { | |
| 1080 $species = "sp."; | |
| 1081 } | |
| 1082 $sub_species = shift(@spflds) if(@spflds); | |
| 1083 } elsif (/^\s+(.+)/) { | |
| 1084 # only split on ';' or '.' so that | |
| 1085 # classification that is 2 words will | |
| 1086 # still get matched | |
| 1087 # use map to remove trailing/leading spaces | |
| 1088 push(@class, map { s/^\s+//; s/\s+$//; $_; } split /[;\.]+/, $1); | |
| 1089 } else { | |
| 1090 last; | |
| 1091 } | |
| 1092 | |
| 1093 $_ = undef; # Empty $_ to trigger read of next line | |
| 1094 } | |
| 1095 | |
| 1096 $$buffer = $_; | |
| 1097 | |
| 1098 # Don't make a species object if it's empty or "Unknown" or "None" | |
| 1099 return unless $genus and $genus !~ /^(Unknown|None)$/i; | |
| 1100 | |
| 1101 # Bio::Species array needs array in Species -> Kingdom direction | |
| 1102 if ($class[$#class] eq $genus) { | |
| 1103 push( @class, $species ); | |
| 1104 } else { | |
| 1105 push( @class, $genus, $species ); | |
| 1106 } | |
| 1107 @class = reverse @class; | |
| 1108 | |
| 1109 my $make = Bio::Species->new(); | |
| 1110 $make->classification( \@class, "FORCE" ); # no name validation please | |
| 1111 $make->common_name( $common ) if $common; | |
| 1112 $make->sub_species( $sub_species ) if $sub_species; | |
| 1113 $make->organelle($organelle) if $organelle; | |
| 1114 return $make; | |
| 1115 } | |
| 1116 | |
| 1117 =head2 _read_FTHelper_GenBank | |
| 1118 | |
| 1119 Title : _read_FTHelper_GenBank | |
| 1120 Usage : _read_FTHelper_GenBank($buffer) | |
| 1121 Function: reads the next FT key line | |
| 1122 Example : | |
| 1123 Returns : Bio::SeqIO::FTHelper object | |
| 1124 Args : filehandle and reference to a scalar | |
| 1125 | |
| 1126 | |
| 1127 =cut | |
| 1128 | |
| 1129 sub _read_FTHelper_GenBank { | |
| 1130 my ($self,$buffer) = @_; | |
| 1131 | |
| 1132 my ($key, # The key of the feature | |
| 1133 $loc # The location line from the feature | |
| 1134 ); | |
| 1135 my @qual = (); # An arrray of lines making up the qualifiers | |
| 1136 | |
| 1137 if ($$buffer =~ /^ (\S+)\s+(.+?)\s*$/o) { | |
| 1138 $key = $1; | |
| 1139 $loc = $2; | |
| 1140 # Read all the lines up to the next feature | |
| 1141 while ( defined($_ = $self->_readline) ) { | |
| 1142 if (/^(\s+)(.+?)\s*$/o) { | |
| 1143 # Lines inside features are preceded by 21 spaces | |
| 1144 # A new feature is preceded by 5 spaces | |
| 1145 if (length($1) > 6) { | |
| 1146 # Add to qualifiers if we're in the qualifiers, or if it's | |
| 1147 # the first qualifier | |
| 1148 if (@qual || (index($2,'/') == 0)) { | |
| 1149 push(@qual, $2); | |
| 1150 } | |
| 1151 # We're still in the location line, so append to location | |
| 1152 else { | |
| 1153 $loc .= $2; | |
| 1154 } | |
| 1155 } else { | |
| 1156 # We've reached the start of the next feature | |
| 1157 last; | |
| 1158 } | |
| 1159 } else { | |
| 1160 # We're at the end of the feature table | |
| 1161 last; | |
| 1162 } | |
| 1163 } | |
| 1164 } else { | |
| 1165 # No feature key | |
| 1166 $self->debug("no feature key!\n"); | |
| 1167 # change suggested by JDiggans to avoid infinite loop- | |
| 1168 # see bugreport 1062. | |
| 1169 # reset buffer to prevent infinite loop | |
| 1170 $$buffer = $self->_readline(); | |
| 1171 return; | |
| 1172 } | |
| 1173 | |
| 1174 # Put the first line of the next feature into the buffer | |
| 1175 $$buffer = $_; | |
| 1176 | |
| 1177 # Make the new FTHelper object | |
| 1178 my $out = new Bio::SeqIO::FTHelper(); | |
| 1179 $out->verbose($self->verbose()); | |
| 1180 $out->key($key); | |
| 1181 $out->loc($loc); | |
| 1182 | |
| 1183 # Now parse and add any qualifiers. (@qual is kept | |
| 1184 # intact to provide informative error messages.) | |
| 1185 QUAL: for (my $i = 0; $i < @qual; $i++) { | |
| 1186 $_ = $qual[$i]; | |
| 1187 my( $qualifier, $value ) = (m{^/([^=]+)(?:=(.+))?}) | |
| 1188 or $self->warn("cannot see new qualifier in feature $key: ". | |
| 1189 $qual[$i]); | |
| 1190 #or $self->throw("Can't see new qualifier in: $_\nfrom:\n" | |
| 1191 # . join('', map "$_\n", @qual)); | |
| 1192 $qualifier = '' unless( defined $qualifier); | |
| 1193 if (defined $value) { | |
| 1194 # Do we have a quoted value? | |
| 1195 if (substr($value, 0, 1) eq '"') { | |
| 1196 # Keep adding to value until we find the trailing quote | |
| 1197 # and the quotes are balanced | |
| 1198 while ($value !~ /\"$/ or $value =~ tr/"/"/ % 2) { | |
| 1199 if($i >= $#qual) { | |
| 1200 $self->warn("Unbalanced quote in:\n" . | |
| 1201 join('', map("$_\n", @qual)) . | |
| 1202 "No further qualifiers will " . | |
| 1203 "be added for this feature"); | |
| 1204 last QUAL; | |
| 1205 } | |
| 1206 $i++; # modifying a for-loop variable inside of the loop | |
| 1207 # is not the best programming style ... | |
| 1208 my $next = $qual[$i]; | |
| 1209 | |
| 1210 # add to value with a space unless the value appears | |
| 1211 # to be a sequence (translation for example) | |
| 1212 if(($value.$next) =~ /[^A-Za-z"-]/) { | |
| 1213 $value .= " "; | |
| 1214 } | |
| 1215 $value .= $next; | |
| 1216 } | |
| 1217 # Trim leading and trailing quotes | |
| 1218 $value =~ s/^"|"$//g; | |
| 1219 # Undouble internal quotes | |
| 1220 $value =~ s/""/\"/g; | |
| 1221 } | |
| 1222 } else { | |
| 1223 $value = '_no_value'; | |
| 1224 } | |
| 1225 # Store the qualifier | |
| 1226 $out->field->{$qualifier} ||= []; | |
| 1227 push(@{$out->field->{$qualifier}},$value); | |
| 1228 } | |
| 1229 return $out; | |
| 1230 } | |
| 1231 | |
| 1232 =head2 _write_line_GenBank | |
| 1233 | |
| 1234 Title : _write_line_GenBank | |
| 1235 Usage : | |
| 1236 Function: internal function | |
| 1237 Example : | |
| 1238 Returns : | |
| 1239 Args : | |
| 1240 | |
| 1241 | |
| 1242 =cut | |
| 1243 | |
| 1244 sub _write_line_GenBank{ | |
| 1245 my ($self,$pre1,$pre2,$line,$length) = @_; | |
| 1246 | |
| 1247 $length || $self->throw("Miscalled write_line_GenBank without length. Programming error!"); | |
| 1248 my $subl = $length - length $pre2; | |
| 1249 my $linel = length $line; | |
| 1250 my $i; | |
| 1251 | |
| 1252 my $sub = substr($line,0,$length - length $pre1); | |
| 1253 | |
| 1254 $self->_print("$pre1$sub\n"); | |
| 1255 | |
| 1256 for($i= ($length - length $pre1);$i < $linel;) { | |
| 1257 $sub = substr($line,$i,($subl)); | |
| 1258 $self->_print("$pre2$sub\n"); | |
| 1259 $i += $subl; | |
| 1260 } | |
| 1261 | |
| 1262 } | |
| 1263 | |
| 1264 =head2 _write_line_GenBank_regex | |
| 1265 | |
| 1266 Title : _write_line_GenBank_regex | |
| 1267 Usage : | |
| 1268 Function: internal function for writing lines of specified | |
| 1269 length, with different first and the next line | |
| 1270 left hand headers and split at specific points in the | |
| 1271 text | |
| 1272 Example : | |
| 1273 Returns : nothing | |
| 1274 Args : file handle, first header, second header, text-line, regex for line breaks, total line length | |
| 1275 | |
| 1276 | |
| 1277 =cut | |
| 1278 | |
| 1279 sub _write_line_GenBank_regex { | |
| 1280 my ($self,$pre1,$pre2,$line,$regex,$length) = @_; | |
| 1281 | |
| 1282 #print STDOUT "Going to print with $line!\n"; | |
| 1283 | |
| 1284 $length || $self->throw( "Miscalled write_line_GenBank without length. Programming error!"); | |
| 1285 | |
| 1286 # if( length $pre1 != length $pre2 ) { | |
| 1287 # $self->throw( "Programming error - cannot called write_line_GenBank_regex with different length pre1 and pre2 tags!"); | |
| 1288 # } | |
| 1289 | |
| 1290 my $subl = $length - (length $pre1) - 2; | |
| 1291 my @lines = (); | |
| 1292 | |
| 1293 CHUNK: while($line) { | |
| 1294 foreach my $pat ($regex, '[,;\.\/-]\s|'.$regex, '[,;\.\/-]|'.$regex) { | |
| 1295 if($line =~ m/^(.{1,$subl})($pat)(.*)/) { | |
| 1296 $line = $3; | |
| 1297 # be strict about not padding spaces according to | |
| 1298 # genbank format | |
| 1299 my $l = $1.$2; | |
| 1300 $l =~ s/\s+$//; | |
| 1301 push(@lines, $l); | |
| 1302 next CHUNK; | |
| 1303 } | |
| 1304 } | |
| 1305 # if we get here none of the patterns matched $subl or less chars | |
| 1306 $self->warn("trouble dissecting \"$line\" into chunks ". | |
| 1307 "of $subl chars or less - this tag won't print right"); | |
| 1308 # insert a space char to prevent infinite loops | |
| 1309 $line = substr($line,0,$subl) . " " . substr($line,$subl); | |
| 1310 } | |
| 1311 | |
| 1312 my $s = shift @lines; | |
| 1313 $self->_print("$pre1$s\n"); | |
| 1314 foreach my $s ( @lines ) { | |
| 1315 $self->_print("$pre2$s\n"); | |
| 1316 } | |
| 1317 } | |
| 1318 | |
| 1319 =head2 _post_sort | |
| 1320 | |
| 1321 Title : _post_sort | |
| 1322 Usage : $obj->_post_sort($newval) | |
| 1323 Function: | |
| 1324 Returns : value of _post_sort | |
| 1325 Args : newvalue (optional) | |
| 1326 | |
| 1327 | |
| 1328 =cut | |
| 1329 | |
| 1330 sub _post_sort{ | |
| 1331 my ($obj,$value) = @_; | |
| 1332 if( defined $value) { | |
| 1333 $obj->{'_post_sort'} = $value; | |
| 1334 } | |
| 1335 return $obj->{'_post_sort'}; | |
| 1336 } | |
| 1337 | |
| 1338 =head2 _show_dna | |
| 1339 | |
| 1340 Title : _show_dna | |
| 1341 Usage : $obj->_show_dna($newval) | |
| 1342 Function: | |
| 1343 Returns : value of _show_dna | |
| 1344 Args : newvalue (optional) | |
| 1345 | |
| 1346 | |
| 1347 =cut | |
| 1348 | |
| 1349 sub _show_dna{ | |
| 1350 my ($obj,$value) = @_; | |
| 1351 if( defined $value) { | |
| 1352 $obj->{'_show_dna'} = $value; | |
| 1353 } | |
| 1354 return $obj->{'_show_dna'}; | |
| 1355 } | |
| 1356 | |
| 1357 =head2 _id_generation_func | |
| 1358 | |
| 1359 Title : _id_generation_func | |
| 1360 Usage : $obj->_id_generation_func($newval) | |
| 1361 Function: | |
| 1362 Returns : value of _id_generation_func | |
| 1363 Args : newvalue (optional) | |
| 1364 | |
| 1365 | |
| 1366 =cut | |
| 1367 | |
| 1368 sub _id_generation_func{ | |
| 1369 my ($obj,$value) = @_; | |
| 1370 if( defined $value ) { | |
| 1371 $obj->{'_id_generation_func'} = $value; | |
| 1372 } | |
| 1373 return $obj->{'_id_generation_func'}; | |
| 1374 } | |
| 1375 | |
| 1376 =head2 _ac_generation_func | |
| 1377 | |
| 1378 Title : _ac_generation_func | |
| 1379 Usage : $obj->_ac_generation_func($newval) | |
| 1380 Function: | |
| 1381 Returns : value of _ac_generation_func | |
| 1382 Args : newvalue (optional) | |
| 1383 | |
| 1384 | |
| 1385 =cut | |
| 1386 | |
| 1387 sub _ac_generation_func{ | |
| 1388 my ($obj,$value) = @_; | |
| 1389 if( defined $value ) { | |
| 1390 $obj->{'_ac_generation_func'} = $value; | |
| 1391 } | |
| 1392 return $obj->{'_ac_generation_func'}; | |
| 1393 } | |
| 1394 | |
| 1395 =head2 _sv_generation_func | |
| 1396 | |
| 1397 Title : _sv_generation_func | |
| 1398 Usage : $obj->_sv_generation_func($newval) | |
| 1399 Function: | |
| 1400 Returns : value of _sv_generation_func | |
| 1401 Args : newvalue (optional) | |
| 1402 | |
| 1403 | |
| 1404 =cut | |
| 1405 | |
| 1406 sub _sv_generation_func{ | |
| 1407 my ($obj,$value) = @_; | |
| 1408 if( defined $value ) { | |
| 1409 $obj->{'_sv_generation_func'} = $value; | |
| 1410 } | |
| 1411 return $obj->{'_sv_generation_func'}; | |
| 1412 | |
| 1413 } | |
| 1414 | |
| 1415 =head2 _kw_generation_func | |
| 1416 | |
| 1417 Title : _kw_generation_func | |
| 1418 Usage : $obj->_kw_generation_func($newval) | |
| 1419 Function: | |
| 1420 Returns : value of _kw_generation_func | |
| 1421 Args : newvalue (optional) | |
| 1422 | |
| 1423 | |
| 1424 =cut | |
| 1425 | |
| 1426 sub _kw_generation_func{ | |
| 1427 my ($obj,$value) = @_; | |
| 1428 if( defined $value ) { | |
| 1429 $obj->{'_kw_generation_func'} = $value; | |
| 1430 } | |
| 1431 return $obj->{'_kw_generation_func'}; | |
| 1432 } | |
| 1433 | |
| 1434 1; |
