Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/SeqIO/genbank.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/genbank.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1434 @@ +# $Id: genbank.pm,v 1.76.2.12 2003/09/13 23:33:04 jason Exp $ +# +# BioPerl module for Bio::SeqIO::GenBank +# +# Cared for by Elia Stupka <elia@tll.org.sg> +# +# Copyright Elia Stupka +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::GenBank - GenBank sequence input/output stream + +=head1 SYNOPSIS + +It is probably best not to use this object directly, but +rather go through the SeqIO handler system. Go: + + $stream = Bio::SeqIO->new(-file => $filename, -format => 'GenBank'); + + while ( my $seq = $stream->next_seq() ) { + # do something with $seq + } + + +=head1 DESCRIPTION + +This object can transform Bio::Seq objects to and from GenBank flat +file databases. + +There is alot of flexibility here about how to dump things which I need +to document fully. + +=head2 Mapping of record properties to object properties + +This section is supposed to document which sections and properties of +a GenBank databank record end up where in the Bioperl object model. It +is far from complete and presently focuses only on those mappings +which may be non-obvious. $seq in the text refers to the +Bio::Seq::RichSeqI implementing object returned by the parser for each +record. + +=over 4 + +=item GI number + +$seq-E<gt>primary_id + +=back + +=head2 Optional functions + +=over 3 + +=item _show_dna() + +(output only) shows the dna or not + +=item _post_sort() + +(output only) provides a sorting func which is applied to the FTHelpers +before printing + +=item _id_generation_func() + +This is function which is called as + + print "ID ", $func($seq), "\n"; + +To generate the ID line. If it is not there, it generates a sensible ID +line using a number of tools. + + +If you want to output annotations in genbank format they need to be +stored in a Bio::Annotation::Collection object which is accessible +through the Bio::SeqI interface method L<annotation()|annotation>. + +The following are the names of the keys which are polled from a +L<Bio::Annotation::Collection> object. + +reference - Should contain Bio::Annotation::Reference objects +comment - Should contain Bio::Annotation::Comment objects + +segment - Should contain a Bio::Annotation::SimpleValue object +origin - Should contain a Bio::Annotation::SimpleValue object + +=back + +=head1 Where does the data go? + +Data parsed in Bio::SeqIO::genbank is stored in a variety of data +fields in the sequence object that is returned. More information in +the HOWTOs about exactly what each field means and where it goes. +Here is a partial list of fields. + +Items listed as RichSeq or Seq or PrimarySeq and then NAME() tell you +the top level object which defines a function called NAME() which +stores this information. + +Items listed as Annotation 'NAME' tell you the data is stored the +associated Bio::Annotation::Colection object which is associated with +Bio::Seq objects. If it is explictly requested that no annotations +should be stored when parsing a record of course they won't be +available when you try and get them. If you are having this problem +look at the type of SeqBuilder that is being used to contruct your +sequence object. + +Comments Annotation 'comment' +References Annotation 'reference' +Segment Annotation 'segment' +Origin Annotation 'origin' + +Accessions PrimarySeq accession_number() +Secondary accessions RichSeq get_secondary_accessions() +Keywords RichSeq keywords() +Dates RichSeq get_dates() +Molecule RichSeq molecule() +Seq Version RichSeq seq_version() +PID RichSeq pid() +Division RichSeq division() +Features Seq get_SeqFeatures() +Alphabet PrimarySeq alphabet() +Definition PrimarySeq description() or desc() +Version PrimarySeq version() + +Sequence PrimarySeq seq() + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://www.bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Elia Stupka + +Email elia@tll.org.sg + +=head1 CONTRIBUTORS + +Ewan Birney birney@ebi.ac.uk +Jason Stajich jason@bioperl.org +Chris Mungall cjm@fruitfly.bdgp.berkeley.edu +Lincoln Stein lstein@cshl.org +Heikki Lehvaslaiho, heikki@ebi.ac.uk +Hilmar Lapp, hlapp@gmx.net +Donald G. Jackson, donald.jackson@bms.com + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::SeqIO::genbank; +use vars qw(@ISA); +use strict; + +use Bio::SeqIO; +use Bio::SeqIO::FTHelper; +use Bio::SeqFeature::Generic; +use Bio::Species; +use Bio::Seq::SeqFactory; +use Bio::Annotation::Collection; +use Bio::Annotation::Comment; +use Bio::Annotation::Reference; +use Bio::Annotation::DBLink; + +@ISA = qw(Bio::SeqIO); + +sub _initialize { + my($self,@args) = @_; + + $self->SUPER::_initialize(@args); + # hash for functions for decoding keys. + $self->{'_func_ftunit_hash'} = {}; + $self->_show_dna(1); # sets this to one by default. People can change it + if( ! defined $self->sequence_factory ) { + $self->sequence_factory(new Bio::Seq::SeqFactory + (-verbose => $self->verbose(), + -type => 'Bio::Seq::RichSeq')); + } +} + +=head2 next_seq + + Title : next_seq + Usage : $seq = $stream->next_seq() + Function: returns the next sequence in the stream + Returns : Bio::Seq object + Args : + +=cut + +sub next_seq { + my ($self,@args) = @_; + my $builder = $self->sequence_builder(); + my $seq; + my %params; + + RECORDSTART: while (1) { + my $buffer; + my (@acc, @features); + my ($display_id, $annotation); + my $species; + + # initialize; we may come here because of starting over + @features = (); + $annotation = undef; + @acc = (); + $species = undef; + %params = (-verbose => $self->verbose); # reset hash + local($/) = "\n"; + while(defined($buffer = $self->_readline())) { + last if index($buffer,'LOCUS ') == 0; + } + return undef if( !defined $buffer ); # end of file + $buffer =~ /^LOCUS\s+(\S.*)$/ || + $self->throw("GenBank stream with bad LOCUS line. Not GenBank in my book. Got '$buffer'"); + + my @tokens = split(' ', $1); + + # this is important to have the id for display in e.g. FTHelper, + # otherwise you won't know which entry caused an error + $display_id = shift(@tokens); + $params{'-display_id'} = $display_id; + # may still be useful if we don't want the seq + $params{'-length'} = shift(@tokens); + # the alphabet of the entry + $params{'-alphabet'} = (lc(shift @tokens) eq 'bp') ? 'dna' : 'protein'; + # for aa there is usually no 'molecule' (mRNA etc) + if (($params{'-alphabet'} eq 'dna') || (@tokens > 2)) { + $params{'-molecule'} = shift(@tokens); + my $circ = shift(@tokens); + if ($circ eq 'circular') { + $params{'-is_circular'} = 1; + $params{'-division'} = shift(@tokens); + } else { + # 'linear' or 'circular' may actually be omitted altogether + $params{'-division'} = + (CORE::length($circ) == 3 ) ? $circ : shift(@tokens); + } + } else { + $params{'-molecule'} = 'PRT' if($params{'-alphabet'} eq 'aa'); + $params{'-division'} = shift(@tokens); + } + my $date = join(' ', @tokens); # we lump together the rest + # this is per request bug #1513 + # we can handle + # 9-10-2003 + # 9-10-03 + #09-10-2003 + #09-10-03 + if($date =~ s/\s*((\d{1,2})-(\w{3})-(\d{2,4})).*/$1/) { + if( length($date) < 11 ) { # improperly formatted date + # But we'll be nice and fix it for them + my ($d,$m,$y) = ($2,$3,$4); + if( length($d) == 1 ) { + $d = "0$d"; + } + # guess the century here + if( length($y) == 2 ) { + if( $y > 60 ) { # arbitrarily guess that '60' means 1960 + $y = "19$y"; + } else { + $y = "20$y"; + } + $self->warn("Date was malformed, guessing the century for $date to be $y\n"); + } + $params{'-dates'} = [join('-',$d,$m,$y)]; + } else { + $params{'-dates'} = [$date]; + } + } + + # set them all at once + $builder->add_slot_value(%params); + %params = (); + + # parse the rest if desired, otherwise start over + if(! $builder->want_object()) { + $builder->make_object(); + next RECORDSTART; + } + + # set up annotation depending on what the builder wants + if($builder->want_slot('annotation')) { + $annotation = new Bio::Annotation::Collection; + } + $buffer = $self->_readline(); + until( !defined ($buffer) ) { + $_ = $buffer; + + # Description line(s) + if (/^DEFINITION\s+(\S.*\S)/) { + my @desc = ($1); + while ( defined($_ = $self->_readline) ) { + if( /^\s+(.*)/ ) { push (@desc, $1); next }; + last; + } + $builder->add_slot_value(-desc => join(' ', @desc)); + # we'll continue right here because DEFINITION always comes + # at the top of the entry + } + # accession number (there can be multiple accessions) + if( /^ACCESSION\s+(\S.*\S)/ ) { + push(@acc, split(/\s+/,$1)); + while( defined($_ = $self->_readline) ) { + /^\s+(.*)/ && do { push (@acc, split(/\s+/,$1)); next }; + last; + } + $buffer = $_; + next; + } + # PID + elsif( /^PID\s+(\S+)/ ) { + $params{'-pid'} = $1; + } + #Version number + elsif( /^VERSION\s+(.+)$/ ) { + my ($acc,$gi) = split(' ',$1); + if($acc =~ /^\w+\.(\d+)/) { + $params{'-version'} = $1; + $params{'-seq_version'} = $1; + } + if($gi && (index($gi,"GI:") == 0)) { + $params{'-primary_id'} = substr($gi,3); + } + } + #Keywords + elsif( /^KEYWORDS\s+(.*)/ ) { + my @kw = split(/\s*\;\s*/,$1); + while( defined($_ = $self->_readline) ) { + chomp; + /^\s+(.*)/ && do { push (@kw, split(/\s*\;\s*/,$1)); next }; + last; + } + + @kw && $kw[-1] =~ s/\.$//; + $params{'-keywords'} = \@kw; + $buffer = $_; + next; + } + # Organism name and phylogenetic information + elsif (/^SOURCE/) { + if($builder->want_slot('species')) { + $species = $self->_read_GenBank_Species(\$buffer); + $builder->add_slot_value(-species => $species); + } else { + while(defined($buffer = $self->_readline())) { + last if substr($buffer,0,1) ne ' '; + } + } + next; + } + #References + elsif (/^REFERENCE/) { + if($annotation) { + my @refs = $self->_read_GenBank_References(\$buffer); + foreach my $ref ( @refs ) { + $annotation->add_Annotation('reference',$ref); + } + } else { + while(defined($buffer = $self->_readline())) { + last if substr($buffer,0,1) ne ' '; + } + } + next; + } + #Comments + elsif (/^COMMENT\s+(.*)/) { + if($annotation) { + my $comment = $1; + while (defined($_ = $self->_readline)) { + last if (/^\S/); + $comment .= $_; + } + $comment =~ s/\n/ /g; + $comment =~ s/ +/ /g; + $annotation->add_Annotation( + 'comment', + Bio::Annotation::Comment->new(-text => $comment)); + $buffer = $_; + } else { + while(defined($buffer = $self->_readline())) { + last if substr($buffer,0,1) ne ' '; + } + } + next; + } elsif( /^SEGMENT\s+(.+)/ ) { + if($annotation) { + my $segment = $1; + while (defined($_ = $self->_readline)) { + last if (/^\S/); + $segment .= $_; + } + $segment =~ s/\n/ /g; + $segment =~ s/ +/ /g; + $annotation->add_Annotation( + 'segment', + Bio::Annotation::SimpleValue->new(-value => $segment)); + $buffer = $_; + } else { + while(defined($buffer = $self->_readline())) { + last if substr($buffer,0,1) ne ' '; + } + } + next; + } + + # Exit at start of Feature table, or start of sequence + last if( /^(FEATURES|ORIGIN)/ ); + # Get next line and loop again + $buffer = $self->_readline; + } + return undef if(! defined($buffer)); + + # add them all at once for efficiency + $builder->add_slot_value(-accession_number => shift(@acc), + -secondary_accessions => \@acc, + %params); + $builder->add_slot_value(-annotation => $annotation) if $annotation; + %params = (); # reset before possible re-use to avoid setting twice + + # start over if we don't want to continue with this entry + if(! $builder->want_object()) { + $builder->make_object(); + next RECORDSTART; + } + + # some "minimal" formats may not necessarily have a feature table + if($builder->want_slot('features') && defined($_) && /^FEATURES/) { + # need to read the first line of the feature table + $buffer = $self->_readline; + + # DO NOT read lines in the while condition -- this is done as a side + # effect in _read_FTHelper_GenBank! + while( defined($buffer) ) { + # check immediately -- not at the end of the loop + # note: GenPept entries obviously do not have a BASE line + last if(($buffer =~ /^BASE/) || ($buffer =~ /^ORIGIN/) || + ($buffer =~ /^CONTIG/) ); + + # slurp in one feature at a time -- at return, the start of + # the next feature will have been read already, so we need + # to pass a reference, and the called method must set this + # to the last line read before returning + + my $ftunit = $self->_read_FTHelper_GenBank(\$buffer); + + # fix suggested by James Diggans + + if( !defined $ftunit ) { + # GRRRR. We have fallen over. Try to recover + $self->warn("Unexpected error in feature table for ".$params{'-display_id'}." Skipping feature, attempting to recover"); + unless( ($buffer =~ /^\s{5,5}\S+/) or ($buffer =~ /^\S+/)) { + $buffer = $self->_readline(); + } + next; # back to reading FTHelpers + } + + # process ftunit + my $feat = + $ftunit->_generic_seqfeature($self->location_factory(), + $display_id); + # add taxon_id from source if available + if($species && ($feat->primary_tag eq 'source') && + $feat->has_tag('db_xref') && (! $species->ncbi_taxid())) { + foreach my $tagval ($feat->get_tag_values('db_xref')) { + if(index($tagval,"taxon:") == 0) { + $species->ncbi_taxid(substr($tagval,6)); + } + } + } + # add feature to list of features + push(@features, $feat); + } + $builder->add_slot_value(-features => \@features); + $_ = $buffer; + } + if( defined ($_) ) { + if( /^CONTIG/ && $builder->want_slot('features')) { + $b = " $_"; # need 5 spaces to treat it like a feature + my $ftunit = $self->_read_FTHelper_GenBank(\$b); + if( ! defined $ftunit ) { + $self->warn("unable to parse the CONTIG feature\n"); + } else { + push(@features, + $ftunit->_generic_seqfeature($self->location_factory(), + $display_id)); + } + } elsif(! /^(ORIGIN|\/\/)/ ) { # advance to the sequence, if any + while (defined( $_ = $self->_readline) ) { + last if /^(ORIGIN|\/\/)/; + } + } + } + if(! $builder->want_object()) { + $builder->make_object(); # implicit end-of-object + next RECORDSTART; + } + if($builder->want_slot('seq')) { + # the fact that we want a sequence does not necessarily mean that + # there also is a sequence ... + if(defined($_) && s/^ORIGIN//) { + chomp; + if( $annotation && length($_) > 0 ) { + $annotation->add_Annotation('origin', + Bio::Annotation::SimpleValue->new(-value => $_)); + } + my $seqc = ''; + while( defined($_ = $self->_readline) ) { + /^\/\// && last; + $_ = uc($_); + s/[^A-Za-z]//g; + $seqc .= $_; + } + $self->debug("sequence length is ". length($seqc) ."\n"); + $builder->add_slot_value(-seq => $seqc); + } + } elsif ( defined($_) && (substr($_,0,2) ne '//')) { + # advance to the end of the record + while( defined($_ = $self->_readline) ) { + last if substr($_,0,2) eq '//'; + } + } + # Unlikely, but maybe the sequence is so weird that we don't want it + # anymore. We don't want to return undef if the stream's not exhausted + # yet. + $seq = $builder->make_object(); + next RECORDSTART unless $seq; + last RECORDSTART; + } # end while RECORDSTART + + return $seq; +} + +=head2 write_seq + + Title : write_seq + Usage : $stream->write_seq($seq) + Function: writes the $seq object (must be seq) to the stream + Returns : 1 for success and 0 for error + Args : array of 1 to n Bio::SeqI objects + + +=cut + +sub write_seq { + my ($self,@seqs) = @_; + + foreach my $seq ( @seqs ) { + $self->throw("Attempting to write with no seq!") unless defined $seq; + + if( ! ref $seq || ! $seq->isa('Bio::SeqI') ) { + $self->warn(" $seq is not a SeqI compliant module. Attempting to dump, but may fail!"); + } + + my $str = $seq->seq; + + my ($div, $mol); + my $len = $seq->length(); + + if ( $seq->can('division') ) { + $div=$seq->division; + } + if( !defined $div || ! $div ) { $div = 'UNK'; } + my $alpha = $seq->alphabet; + if( !$seq->can('molecule') || ! defined ($mol = $seq->molecule()) ) { + $mol = $alpha || 'DNA'; + } + + my $circular = 'linear '; + $circular = 'circular' if $seq->is_circular; + + local($^W) = 0; # supressing warnings about uninitialized fields. + + my $temp_line; + if( $self->_id_generation_func ) { + $temp_line = &{$self->_id_generation_func}($seq); + } else { + my $date = ''; + if( $seq->can('get_dates') ) { + ($date) = $seq->get_dates(); + } + $temp_line = sprintf ("%-12s%-15s%13s %s%4s%-8s%-8s %3s %-s", + 'LOCUS', $seq->id(),$len, + (lc($alpha) eq 'protein') ? ('aa','', '') : + ('bp', '',$mol),$circular, + $div,$date); + } + + $self->_print("$temp_line\n"); + $self->_write_line_GenBank_regex("DEFINITION ", " ", + $seq->desc(),"\\s\+\|\$",80); + + # if there, write the accession line + + if( $self->_ac_generation_func ) { + $temp_line = &{$self->_ac_generation_func}($seq); + $self->_print("ACCESSION $temp_line\n"); + } else { + my @acc = (); + push(@acc, $seq->accession_number()); + if( $seq->isa('Bio::Seq::RichSeqI') ) { + push(@acc, $seq->get_secondary_accessions()); + } + $self->_print("ACCESSION ", join(" ", @acc), "\n"); + # otherwise - cannot print <sigh> + } + + # if PID defined, print it + if($seq->isa('Bio::Seq::RichSeqI') && $seq->pid()) { + $self->_print("PID ", $seq->pid(), "\n"); + } + + # if there, write the version line + + if( defined $self->_sv_generation_func() ) { + $temp_line = &{$self->_sv_generation_func}($seq); + if( $temp_line ) { + $self->_print("VERSION $temp_line\n"); + } + } else { + if($seq->isa('Bio::Seq::RichSeqI') && defined($seq->seq_version)) { + my $id = $seq->primary_id(); # this may be a GI number + $self->_print("VERSION ", + $seq->accession_number(), ".", $seq->seq_version, + ($id && ($id =~ /^\d+$/) ? " GI:".$id : ""), + "\n"); + } + } + + # if there, write the keywords line + + if( defined $self->_kw_generation_func() ) { + $temp_line = &{$self->_kw_generation_func}($seq); + $self->_print("KEYWORDS $temp_line\n"); + } else { + if( $seq->can('keywords') ) { + my $kw = $seq->keywords; + if( ref($kw) =~ /ARRAY/i ) { + $kw = join("; ", @$kw); + } + $kw .= '.' if( $kw !~ /\.$/ ); + $self->_print("KEYWORDS $kw\n"); + } + } + + # SEGMENT if it exists + foreach my $ref ( $seq->annotation->get_Annotations('segment') ) { + $self->_print(sprintf ("%-11s %s\n",'SEGMENT', + $ref->value)); + } + + # Organism lines + if (my $spec = $seq->species) { + my ($species, $genus, @class) = $spec->classification(); + my $OS; + if( $spec->common_name ) { + $OS = $spec->common_name; + } else { + $OS = "$genus $species"; + } + if (my $ssp = $spec->sub_species) { + $OS .= " $ssp"; + } + $self->_print("SOURCE $OS\n"); + $self->_print(" ORGANISM ", + ($spec->organelle() ? $spec->organelle()." " : ""), + "$genus $species", "\n"); + my $OC = join('; ', (reverse(@class), $genus)) .'.'; + $self->_write_line_GenBank_regex(' 'x12,' 'x12, + $OC,"\\s\+\|\$",80); + } + + # Reference lines + my $count = 1; + foreach my $ref ( $seq->annotation->get_Annotations('reference') ) { + $temp_line = sprintf ("REFERENCE $count (%s %d to %d)", + ($seq->alphabet() eq "protein" ? + "residues" : "bases"), + $ref->start,$ref->end); + $self->_print("$temp_line\n"); + $self->_write_line_GenBank_regex(" AUTHORS ",' 'x12, + $ref->authors,"\\s\+\|\$",80); + $self->_write_line_GenBank_regex(" TITLE "," "x12, + $ref->title,"\\s\+\|\$",80); + $self->_write_line_GenBank_regex(" JOURNAL "," "x12, + $ref->location,"\\s\+\|\$",80); + if ($ref->comment) { + $self->_write_line_GenBank_regex(" REMARK "," "x12, + $ref->comment,"\\s\+\|\$",80); + } + if( $ref->medline) { + $self->_write_line_GenBank_regex(" MEDLINE "," "x12, + $ref->medline, "\\s\+\|\$",80); + # I am assuming that pubmed entries only exist when there + # are also MEDLINE entries due to the indentation + # This could be a wrong assumption + if( $ref->pubmed ) { + $self->_write_line_GenBank_regex(" PUBMED "," "x12, + $ref->pubmed, "\\s\+\|\$", + 80); + } + } + $count++; + } + # Comment lines + + foreach my $comment ( $seq->annotation->get_Annotations('comment') ) { + $self->_write_line_GenBank_regex("COMMENT "," "x12, + $comment->text,"\\s\+\|\$",80); + } + $self->_print("FEATURES Location/Qualifiers\n"); + + my $contig; + if( defined $self->_post_sort ) { + # we need to read things into an array. Process. Sort them. Print 'em + + my $post_sort_func = $self->_post_sort(); + my @fth; + + foreach my $sf ( $seq->top_SeqFeatures ) { + push(@fth,Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq)); + } + + @fth = sort { &$post_sort_func($a,$b) } @fth; + + foreach my $fth ( @fth ) { + $self->_print_GenBank_FTHelper($fth); + } + } else { + # not post sorted. And so we can print as we get them. + # lower memory load... + + foreach my $sf ( $seq->top_SeqFeatures ) { + my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq); + foreach my $fth ( @fth ) { + if( ! $fth->isa('Bio::SeqIO::FTHelper') ) { + $sf->throw("Cannot process FTHelper... $fth"); + } + $self->_print_GenBank_FTHelper($fth); + } + } + } + if( $seq->length == 0 ) { $self->_show_dna(0) } + + if( $self->_show_dna() == 0 ) { + $self->_print("\n//\n"); + return; + } + +# finished printing features. + + $str =~ tr/A-Z/a-z/; + +# Count each nucleotide + unless( $mol eq 'protein' ) { + my $alen = $str =~ tr/a/a/; + my $clen = $str =~ tr/c/c/; + my $glen = $str =~ tr/g/g/; + my $tlen = $str =~ tr/t/t/; + + my $olen = $len - ($alen + $tlen + $clen + $glen); + if( $olen < 0 ) { + $self->warn("Weird. More atgc than bases. Problem!"); + } + + my $base_count = sprintf("BASE COUNT %8s a %6s c %6s g %6s t%s\n", + $alen,$clen,$glen,$tlen, + ( $olen > 0 ) ? sprintf("%6s others",$olen) : ''); + $self->_print($base_count); + } + my ($o) = $seq->annotation->get_Annotations('origin'); + $self->_print(sprintf("%-6s%s\n",'ORIGIN',$o ? $o->value : '')); + +# print out the sequence + my $nuc = 60; # Number of nucleotides per line + my $whole_pat = 'a10' x 6; # Pattern for unpacking a whole line + my $out_pat = 'A11' x 6; # Pattern for packing a line + my $length = length($str); + + # Calculate the number of nucleotides which fit on whole lines + my $whole = int($length / $nuc) * $nuc; + + # Print the whole lines + my $i; + for ($i = 0; $i < $whole; $i += $nuc) { + my $blocks = pack $out_pat, + unpack $whole_pat, + substr($str, $i, $nuc); + chop $blocks; + $self->_print(sprintf("%9d $blocks\n", $i + $nuc - 59)); + } + + # Print the last line + if (my $last = substr($str, $i)) { + my $last_len = length($last); + my $last_pat = 'a10' x int($last_len / 10) .'a'. $last_len % 10; + my $blocks = pack $out_pat, + unpack($last_pat, $last); + $blocks =~ s/ +$//; + $self->_print(sprintf("%9d $blocks\n", $length - $last_len + 1)); + } + + $self->_print("//\n"); + + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; + } +} + +=head2 _print_GenBank_FTHelper + + Title : _print_GenBank_FTHelper + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub _print_GenBank_FTHelper { + my ($self,$fth,$always_quote) = @_; + + if( ! ref $fth || ! $fth->isa('Bio::SeqIO::FTHelper') ) { + $fth->warn("$fth is not a FTHelper class. Attempting to print, but there could be tears!"); + } + if( defined $fth->key && + $fth->key eq 'CONTIG' ) { + $self->_write_line_GenBank_regex(sprintf("%-12s",$fth->key), + ' 'x12,$fth->loc,"\,\|\$",80); + } else { + $self->_write_line_GenBank_regex(sprintf(" %-16s",$fth->key), + " "x21, + $fth->loc,"\,\|\$",80); + } + + if( !defined $always_quote) { $always_quote = 0; } + + foreach my $tag ( keys %{$fth->field} ) { + foreach my $value ( @{$fth->field->{$tag}} ) { + $value =~ s/\"/\"\"/g; + if ($value eq "_no_value") { + $self->_write_line_GenBank_regex(" "x21, + " "x21, + "/$tag","\.\|\$",80); + } + elsif( $always_quote == 1 || $value !~ /^\d+$/ ) { + my ($pat) = ($value =~ /\s/ ? '\s|$' : '.|$'); + $self->_write_line_GenBank_regex(" "x21, + " "x21, + "/$tag=\"$value\"",$pat,80); + } else { + $self->_write_line_GenBank_regex(" "x21, + " "x21, + "/$tag=$value","\.\|\$",80); + } + } + } + +} + + +=head2 _read_GenBank_References + + Title : _read_GenBank_References + Usage : + Function: Reads references from GenBank format. Internal function really + Returns : + Args : + + +=cut + +sub _read_GenBank_References{ + my ($self,$buffer) = @_; + my (@refs); + my $ref; + + # assumme things are starting with RN + + if( $$buffer !~ /^REFERENCE/ ) { + warn("Not parsing line '$$buffer' which maybe important"); + } + + $_ = $$buffer; + + my (@title,@loc,@authors,@com,@medline,@pubmed); + + REFLOOP: while( defined($_) || defined($_ = $self->_readline) ) { + if (/^ AUTHORS\s+(.*)/) { + push (@authors, $1); + while ( defined($_ = $self->_readline) ) { + /^\s{3,}(.*)/ && do { push (@authors, $1);next;}; + last; + } + $ref->authors(join(' ', @authors)); + } + if (/^ TITLE\s+(.*)/) { + push (@title, $1); + while ( defined($_ = $self->_readline) ) { + /^\s{3,}(.*)/ && do { push (@title, $1); + next; + }; + last; + } + $ref->title(join(' ', @title)); + } + if (/^ JOURNAL\s+(.*)/) { + push(@loc, $1); + while ( defined($_ = $self->_readline) ) { + /^\s{3,}(.*)/ && do { push(@loc, $1); + next; + }; + last; + } + $ref->location(join(' ', @loc)); + redo REFLOOP; + } + if (/^ REMARK\s+(.*)/) { + push (@com, $1); + while ( defined($_ = $self->_readline) ) { + /^\s{3,}(.*)/ && do { push(@com, $1); + next; + }; + last; + } + $ref->comment(join(' ', @com)); + redo REFLOOP; + } + if( /^ MEDLINE\s+(.*)/ ) { + push(@medline,$1); + while ( defined($_ = $self->_readline) ) { + /^\s{4,}(.*)/ && do { push(@medline, $1); + next; + }; + last; + } + $ref->medline(join(' ', @medline)); + redo REFLOOP; + } + if( /^ PUBMED\s+(.*)/ ) { + push(@pubmed,$1); + while ( defined($_ = $self->_readline) ) { + /^\s{5,}(.*)/ && do { push(@pubmed, $1); + next; + }; + last; + } + $ref->pubmed(join(' ', @pubmed)); + redo REFLOOP; + } + + /^REFERENCE/ && do { + # store current reference + $self->_add_ref_to_array(\@refs,$ref) if $ref; + # reset + @authors = (); + @title = (); + @loc = (); + @com = (); + @pubmed = (); + @medline = (); + # create the new reference object + $ref = Bio::Annotation::Reference->new(); + # check whether start and end base is given + if (/^REFERENCE\s+\d+\s+\([a-z]+ (\d+) to (\d+)/){ + $ref->start($1); + $ref->end($2); + } + }; + + /^(FEATURES)|(COMMENT)/ && last; + + $_ = undef; # Empty $_ to trigger read of next line + } + + # store last reference + $self->_add_ref_to_array(\@refs,$ref) if $ref; + + $$buffer = $_; + + #print "\nnumber of references found: ", $#refs+1,"\n"; + + return @refs; +} + +# +# This is undocumented as it shouldn't be called by anywhere else as +# read_GenBank_References. For those who still want to know: +# +# Purpose: adds a Reference object to an array of Reference objects, takes +# care of possible cleanups to be done (currently, only author and title +# will be chopped of trailing semicolons). +# Parameters: +# a reference to an array of Reference objects +# the Reference object to be added +# Returns: nothing +# +sub _add_ref_to_array { + my ($self, $refs, $ref) = @_; + + # first, polish author and title by removing possible trailing semicolons + my $au = $ref->authors(); + my $title = $ref->title(); + $au =~ s/;\s*$//g if $au; + $title =~ s/;\s*$//g if $title; + $ref->authors($au); + $ref->title($title); + # the rest should be clean already, so go ahead and add it + push(@{$refs}, $ref); +} + +=head2 _read_GenBank_Species + + Title : _read_GenBank_Species + Usage : + Function: Reads the GenBank Organism species and classification + lines. + Example : + Returns : A Bio::Species object + Args : a reference to the current line buffer + +=cut + +sub _read_GenBank_Species { + my( $self,$buffer) = @_; + my @organell_names = ("chloroplast", "mitochondr"); + # only those carrying DNA, apart from the nucleus + + $_ = $$buffer; + + my( $sub_species, $species, $genus, $common, $organelle, @class ); + # upon first entering the loop, we must not read a new line -- the SOURCE + # line is already in the buffer (HL 05/10/2000) + while (defined($_) || defined($_ = $self->_readline())) { + # de-HTMLify (links that may be encountered here don't contain + # escaped '>', so a simple-minded approach suffices) + s/<[^>]+>//g; + if (/^SOURCE\s+(.*)/) { + # FIXME this is probably mostly wrong (e.g., it yields things like + # Homo sapiens adult placenta cDNA to mRNA + # which is certainly not what you want) + $common = $1; + $common =~ s/\.$//; # remove trailing dot + } elsif (/^\s+ORGANISM/) { + my @spflds = split(' ', $_); + shift(@spflds); # ORGANISM + if(grep { $_ =~ /^$spflds[0]/i; } @organell_names) { + $organelle = shift(@spflds); + } + $genus = shift(@spflds); + if(@spflds) { + $species = shift(@spflds); + } else { + $species = "sp."; + } + $sub_species = shift(@spflds) if(@spflds); + } elsif (/^\s+(.+)/) { + # only split on ';' or '.' so that + # classification that is 2 words will + # still get matched + # use map to remove trailing/leading spaces + push(@class, map { s/^\s+//; s/\s+$//; $_; } split /[;\.]+/, $1); + } else { + last; + } + + $_ = undef; # Empty $_ to trigger read of next line + } + + $$buffer = $_; + + # Don't make a species object if it's empty or "Unknown" or "None" + return unless $genus and $genus !~ /^(Unknown|None)$/i; + + # Bio::Species array needs array in Species -> Kingdom direction + if ($class[$#class] eq $genus) { + push( @class, $species ); + } else { + push( @class, $genus, $species ); + } + @class = reverse @class; + + my $make = Bio::Species->new(); + $make->classification( \@class, "FORCE" ); # no name validation please + $make->common_name( $common ) if $common; + $make->sub_species( $sub_species ) if $sub_species; + $make->organelle($organelle) if $organelle; + return $make; +} + +=head2 _read_FTHelper_GenBank + + Title : _read_FTHelper_GenBank + Usage : _read_FTHelper_GenBank($buffer) + Function: reads the next FT key line + Example : + Returns : Bio::SeqIO::FTHelper object + Args : filehandle and reference to a scalar + + +=cut + +sub _read_FTHelper_GenBank { + my ($self,$buffer) = @_; + + my ($key, # The key of the feature + $loc # The location line from the feature + ); + my @qual = (); # An arrray of lines making up the qualifiers + + if ($$buffer =~ /^ (\S+)\s+(.+?)\s*$/o) { + $key = $1; + $loc = $2; + # Read all the lines up to the next feature + while ( defined($_ = $self->_readline) ) { + if (/^(\s+)(.+?)\s*$/o) { + # Lines inside features are preceded by 21 spaces + # A new feature is preceded by 5 spaces + if (length($1) > 6) { + # Add to qualifiers if we're in the qualifiers, or if it's + # the first qualifier + if (@qual || (index($2,'/') == 0)) { + push(@qual, $2); + } + # We're still in the location line, so append to location + else { + $loc .= $2; + } + } else { + # We've reached the start of the next feature + last; + } + } else { + # We're at the end of the feature table + last; + } + } + } else { + # No feature key + $self->debug("no feature key!\n"); + # change suggested by JDiggans to avoid infinite loop- + # see bugreport 1062. + # reset buffer to prevent infinite loop + $$buffer = $self->_readline(); + return; + } + + # Put the first line of the next feature into the buffer + $$buffer = $_; + + # Make the new FTHelper object + my $out = new Bio::SeqIO::FTHelper(); + $out->verbose($self->verbose()); + $out->key($key); + $out->loc($loc); + + # Now parse and add any qualifiers. (@qual is kept + # intact to provide informative error messages.) + QUAL: for (my $i = 0; $i < @qual; $i++) { + $_ = $qual[$i]; + my( $qualifier, $value ) = (m{^/([^=]+)(?:=(.+))?}) + or $self->warn("cannot see new qualifier in feature $key: ". + $qual[$i]); + #or $self->throw("Can't see new qualifier in: $_\nfrom:\n" + # . join('', map "$_\n", @qual)); + $qualifier = '' unless( defined $qualifier); + if (defined $value) { + # Do we have a quoted value? + if (substr($value, 0, 1) eq '"') { + # Keep adding to value until we find the trailing quote + # and the quotes are balanced + while ($value !~ /\"$/ or $value =~ tr/"/"/ % 2) { + if($i >= $#qual) { + $self->warn("Unbalanced quote in:\n" . + join('', map("$_\n", @qual)) . + "No further qualifiers will " . + "be added for this feature"); + last QUAL; + } + $i++; # modifying a for-loop variable inside of the loop + # is not the best programming style ... + my $next = $qual[$i]; + + # add to value with a space unless the value appears + # to be a sequence (translation for example) + if(($value.$next) =~ /[^A-Za-z"-]/) { + $value .= " "; + } + $value .= $next; + } + # Trim leading and trailing quotes + $value =~ s/^"|"$//g; + # Undouble internal quotes + $value =~ s/""/\"/g; + } + } else { + $value = '_no_value'; + } + # Store the qualifier + $out->field->{$qualifier} ||= []; + push(@{$out->field->{$qualifier}},$value); + } + return $out; +} + +=head2 _write_line_GenBank + + Title : _write_line_GenBank + Usage : + Function: internal function + Example : + Returns : + Args : + + +=cut + +sub _write_line_GenBank{ + my ($self,$pre1,$pre2,$line,$length) = @_; + + $length || $self->throw("Miscalled write_line_GenBank without length. Programming error!"); + my $subl = $length - length $pre2; + my $linel = length $line; + my $i; + + my $sub = substr($line,0,$length - length $pre1); + + $self->_print("$pre1$sub\n"); + + for($i= ($length - length $pre1);$i < $linel;) { + $sub = substr($line,$i,($subl)); + $self->_print("$pre2$sub\n"); + $i += $subl; + } + +} + +=head2 _write_line_GenBank_regex + + Title : _write_line_GenBank_regex + Usage : + Function: internal function for writing lines of specified + length, with different first and the next line + left hand headers and split at specific points in the + text + Example : + Returns : nothing + Args : file handle, first header, second header, text-line, regex for line breaks, total line length + + +=cut + +sub _write_line_GenBank_regex { + my ($self,$pre1,$pre2,$line,$regex,$length) = @_; + + #print STDOUT "Going to print with $line!\n"; + + $length || $self->throw( "Miscalled write_line_GenBank without length. Programming error!"); + +# if( length $pre1 != length $pre2 ) { +# $self->throw( "Programming error - cannot called write_line_GenBank_regex with different length pre1 and pre2 tags!"); +# } + + my $subl = $length - (length $pre1) - 2; + my @lines = (); + + CHUNK: while($line) { + foreach my $pat ($regex, '[,;\.\/-]\s|'.$regex, '[,;\.\/-]|'.$regex) { + if($line =~ m/^(.{1,$subl})($pat)(.*)/) { + $line = $3; + # be strict about not padding spaces according to + # genbank format + my $l = $1.$2; + $l =~ s/\s+$//; + push(@lines, $l); + next CHUNK; + } + } + # if we get here none of the patterns matched $subl or less chars + $self->warn("trouble dissecting \"$line\" into chunks ". + "of $subl chars or less - this tag won't print right"); + # insert a space char to prevent infinite loops + $line = substr($line,0,$subl) . " " . substr($line,$subl); + } + + my $s = shift @lines; + $self->_print("$pre1$s\n"); + foreach my $s ( @lines ) { + $self->_print("$pre2$s\n"); + } +} + +=head2 _post_sort + + Title : _post_sort + Usage : $obj->_post_sort($newval) + Function: + Returns : value of _post_sort + Args : newvalue (optional) + + +=cut + +sub _post_sort{ + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'_post_sort'} = $value; + } + return $obj->{'_post_sort'}; +} + +=head2 _show_dna + + Title : _show_dna + Usage : $obj->_show_dna($newval) + Function: + Returns : value of _show_dna + Args : newvalue (optional) + + +=cut + +sub _show_dna{ + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'_show_dna'} = $value; + } + return $obj->{'_show_dna'}; +} + +=head2 _id_generation_func + + Title : _id_generation_func + Usage : $obj->_id_generation_func($newval) + Function: + Returns : value of _id_generation_func + Args : newvalue (optional) + + +=cut + +sub _id_generation_func{ + my ($obj,$value) = @_; + if( defined $value ) { + $obj->{'_id_generation_func'} = $value; + } + return $obj->{'_id_generation_func'}; +} + +=head2 _ac_generation_func + + Title : _ac_generation_func + Usage : $obj->_ac_generation_func($newval) + Function: + Returns : value of _ac_generation_func + Args : newvalue (optional) + + +=cut + +sub _ac_generation_func{ + my ($obj,$value) = @_; + if( defined $value ) { + $obj->{'_ac_generation_func'} = $value; + } + return $obj->{'_ac_generation_func'}; +} + +=head2 _sv_generation_func + + Title : _sv_generation_func + Usage : $obj->_sv_generation_func($newval) + Function: + Returns : value of _sv_generation_func + Args : newvalue (optional) + + +=cut + +sub _sv_generation_func{ + my ($obj,$value) = @_; + if( defined $value ) { + $obj->{'_sv_generation_func'} = $value; + } + return $obj->{'_sv_generation_func'}; + +} + +=head2 _kw_generation_func + + Title : _kw_generation_func + Usage : $obj->_kw_generation_func($newval) + Function: + Returns : value of _kw_generation_func + Args : newvalue (optional) + + +=cut + +sub _kw_generation_func{ + my ($obj,$value) = @_; + if( defined $value ) { + $obj->{'_kw_generation_func'} = $value; + } + return $obj->{'_kw_generation_func'}; +} + +1;