Mercurial > repos > mahtabm > ensemb_rep_gvl

diff variant_effect_predictor/Bio/SeqIO/embl.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author: mahtabm
date: Thu, 11 Apr 2013 06:29:17 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/SeqIO/embl.pm	Thu Apr 11 06:29:17 2013 -0400
@@ -0,0 +1,1240 @@
+# $Id: embl.pm,v 1.57.2.6 2003/09/14 19:06:51 jason Exp $
+#
+# BioPerl module for Bio::SeqIO::EMBL
+#
+# Cared for by Ewan Birney <birney@ebi.ac.uk>
+#
+# Copyright Ewan Birney
+#
+# You may distribute this module under the same terms as perl itself
+
+# POD documentation - main docs before the code
+
+=head1 NAME
+
+Bio::SeqIO::embl - EMBL sequence input/output stream
+
+=head1 SYNOPSIS
+
+It is probably best not to use this object directly, but
+rather go through the AnnSeqIO handler system. Go:
+
+    $stream = Bio::SeqIO->new(-file => $filename, -format => 'EMBL');
+
+    while ( (my $seq = $stream->next_seq()) ) {
+	# do something with $seq
+    }
+
+=head1 DESCRIPTION
+
+This object can transform Bio::Seq objects to and from EMBL flat
+file databases.
+
+There is alot of flexibility here about how to dump things which I need
+to document fully.
+
+There should be a common object that this and genbank share (probably
+with swissprot). Too much of the magic is identical. 
+
+=head2 Optional functions
+
+=over 3
+
+=item _show_dna()
+
+(output only) shows the dna or not
+
+=item _post_sort()
+
+(output only) provides a sorting func which is applied to the FTHelpers
+before printing
+
+=item _id_generation_func()
+
+This is function which is called as 
+
+   print "ID   ", $func($annseq), "\n";
+
+To generate the ID line. If it is not there, it generates a sensible ID
+line using a number of tools.
+
+If you want to output annotations in embl format they need to be
+stored in a Bio::Annotation::Collection object which is accessible
+through the Bio::SeqI interface method L<annotation()|annotation>.  
+
+The following are the names of the keys which are polled from a
+L<Bio::Annotation::Collection> object.
+
+reference       - Should contain Bio::Annotation::Reference objects
+comment         - Should contain Bio::Annotation::Comment objects
+dblink          - Should contain Bio::Annotation::DBLink objects
+
+=back
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the evolution of this
+and other Bioperl modules. Send your comments and suggestions preferably
+ to one of the Bioperl mailing lists.
+Your participation is much appreciated.
+
+  bioperl-l@bioperl.org                 - General discussion
+  http://www.bioperl.org/MailList.shtml - About the mailing lists
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+ the bugs and their resolution.
+ Bug reports can be submitted via email or the web:
+
+  bioperl-bugs@bio.perl.org
+  http://bugzilla.bioperl.org/
+
+=head1 AUTHOR - Ewan Birney
+
+Email birney@ebi.ac.uk
+
+Describe contact details here
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object
+methods. Internal methods are usually preceded with a _
+
+=cut
+
+
+# Let the code begin...
+
+
+package Bio::SeqIO::embl;
+use vars qw(@ISA);
+use strict;
+use Bio::SeqIO::FTHelper;
+use Bio::SeqFeature::Generic;
+use Bio::Species;
+use Bio::Seq::SeqFactory;
+use Bio::Annotation::Collection;
+use Bio::Annotation::Comment;
+use Bio::Annotation::Reference;
+use Bio::Annotation::DBLink;
+
+@ISA = qw(Bio::SeqIO);
+
+sub _initialize {
+  my($self,@args) = @_;
+
+  $self->SUPER::_initialize(@args);  
+  # hash for functions for decoding keys.
+  $self->{'_func_ftunit_hash'} = {}; 
+  $self->_show_dna(1); # sets this to one by default. People can change it 
+  if( ! defined $self->sequence_factory ) {
+      $self->sequence_factory(new Bio::Seq::SeqFactory
+			      (-verbose => $self->verbose(), 
+			       -type => 'Bio::Seq::RichSeq'));
+  }
+}
+
+=head2 next_seq
+
+ Title   : next_seq
+ Usage   : $seq = $stream->next_seq()
+ Function: returns the next sequence in the stream
+ Returns : Bio::Seq object
+ Args    :
+
+=cut
+
+sub next_seq {
+   my ($self,@args) = @_;
+   my ($pseq,$c,$line,$name,$desc,$acc,$seqc,$mol,$div, 
+       $date, $comment, @date_arr);
+
+   my ($annotation, %params, @features) = ( new Bio::Annotation::Collection);
+
+   $line = $self->_readline;   # This needs to be before the first eof() test
+
+   if( !defined $line ) {
+       return undef; # no throws - end of file
+   }
+
+   if( $line =~ /^\s+$/ ) {
+       while( defined ($line = $self->_readline) ) {
+	   $line =~/^\S/ && last;
+       }
+   }   
+   if( !defined $line ) {
+       return undef; # end of file
+   }
+   $line =~ /^ID\s+\S+/ || $self->throw("EMBL stream with no ID. Not embl in my book");
+   $line =~ /^ID\s+(\S+)\s+\S+\;\s+([^;]+)\;\s+(\S+)\;/;
+   $name = $1;
+   $mol = $2;
+   $div = $3;
+   if(! $name) {
+       $name = "unknown id";
+   }
+   my $alphabet;
+
+    # this is important to have the id for display in e.g. FTHelper, otherwise
+    # you won't know which entry caused an error
+   if($mol) {
+       if ( $mol =~ /circular/ ) {
+	   $params{'-is_circular'} = 1;
+	   $mol =~  s|circular ||;
+       }
+       if (defined $mol ) {	   
+	   if ($mol =~ /DNA/) {
+	       $alphabet='dna';
+	   }
+	   elsif ($mol =~ /RNA/) {
+	       $alphabet='rna';
+	   }
+	   elsif ($mol =~ /AA/) {
+	       $alphabet='protein';
+	   }
+       }
+
+   }
+   
+#   $self->warn("not parsing upper annotation in EMBL file yet!");
+   my $buffer = $line;
+
+   local $_;
+ 
+   BEFORE_FEATURE_TABLE :
+   until( !defined $buffer ) {
+       $_ = $buffer;
+
+       # Exit at start of Feature table
+       last if /^F[HT]/;
+
+       # Description line(s)
+       if (/^DE\s+(\S.*\S)/) {
+           $desc .= $desc ? " $1" : $1;
+       }
+
+       #accession number
+       if( /^AC\s+(.*)?/ ) {
+	   my @accs = split(/[; ]+/, $1); # allow space in addition
+	   $params{'-accession_number'} = shift @accs 
+	       unless defined $params{'-accession_number'};
+	   push @{$params{'-secondary_accessions'}}, @accs;
+       }
+       
+       #version number
+       if( /^SV\s+\S+\.(\d+);?/ ) {
+	   my $sv = $1;
+	   #$sv =~ s/\;//;
+	   $params{'-seq_version'} = $sv;
+	   $params{'-version'} = $sv;
+       }
+
+       #date (NOTE: takes last date line)
+       if( /^DT\s+(.+)$/ ) {
+	   my $date = $1;
+	   push @{$params{'-dates'}}, $date;
+       }
+       
+       #keywords
+       if( /^KW\s+(.*)\S*$/ ) {
+	   my @kw = split(/\s*\;\s*/,$1);
+	   push @{$params{'-keywords'}}, @kw;
+       }
+
+       # Organism name and phylogenetic information
+       elsif (/^O[SC]/) {
+           my $species = $self->_read_EMBL_Species(\$buffer);
+           $params{'-species'}= $species;
+       }
+
+       # References
+       elsif (/^R/) {
+	   my @refs = $self->_read_EMBL_References(\$buffer);
+	   foreach my $ref ( @refs ) {
+	       $annotation->add_Annotation('reference',$ref);
+	   }
+       }
+       
+       # DB Xrefs
+       elsif (/^DR/) {
+	   my @links = $self->_read_EMBL_DBLink(\$buffer);
+	   foreach my $dblink ( @links ) {
+	       $annotation->add_Annotation('dblink',$dblink);
+	   }
+       }
+       
+       # Comments
+       elsif (/^CC\s+(.*)/) {
+	   $comment .= $1;
+	   $comment .= " ";
+	   while (defined ($_ = $self->_readline) ) {
+	       if (/^CC\s+(.*)/) {
+		   $comment .= $1;
+		   $comment .= " ";
+	       }
+	       else { 
+		   last;
+	       }
+	   }
+	   my $commobj = Bio::Annotation::Comment->new();
+	   $commobj->text($comment);
+	   $annotation->add_Annotation('comment',$commobj);
+	   $comment = "";
+       }
+
+       # Get next line.
+       $buffer = $self->_readline;
+   }
+
+   while( defined ($_ = $self->_readline) ) {
+       /^FT   \w/ && last;
+       /^SQ / && last;
+       /^CO / && last;
+   }
+   $buffer = $_;
+
+   if (defined($buffer) && $buffer =~ /^FT /) {
+     until( !defined ($buffer) ) {
+	 my $ftunit = $self->_read_FTHelper_EMBL(\$buffer);
+	 # process ftunit
+
+	 push(@features,
+	      $ftunit->_generic_seqfeature($self->location_factory(), $name));
+
+	 if( $buffer !~ /^FT/ ) {
+	     last;
+	 }
+     }
+   }
+
+
+   # skip comments
+   while( defined ($buffer) && $buffer =~ /^XX/ ) { 
+       $buffer = $self->_readline();
+   }
+   
+   if( $buffer =~ /^CO/  ) {	   
+       until( !defined ($buffer) ) {
+	   my $ftunit = $self->_read_FTHelper_EMBL(\$buffer);
+	   # process ftunit
+	   push(@features,
+		$ftunit->_generic_seqfeature($self->location_factory(),
+					     $name));
+	   
+	   if( $buffer !~ /^CO/ ) {
+	       last;
+	   }
+       }       
+   }
+   if( $buffer !~ /^SQ/  ) {
+       while( defined ($_ = $self->_readline) ) {
+	   /^SQ/ && last;
+       }
+   }
+   
+   $seqc = "";	       
+   while( defined ($_ = $self->_readline) ) {
+       /^\/\// && last;
+       $_ = uc($_);
+       s/[^A-Za-z]//g;
+       $seqc .= $_;
+   }
+   my $seq = $self->sequence_factory->create
+       (-verbose => $self->verbose(),
+	-division => $div,
+	-seq => $seqc,
+	-desc => $desc,
+	-display_id => $name,
+	-annotation => $annotation,
+	-molecule => $mol,
+	-alphabet => $alphabet,
+	-features => \@features,
+	%params);
+
+   return $seq;
+}
+
+=head2 write_seq
+
+ Title   : write_seq
+ Usage   : $stream->write_seq($seq)
+ Function: writes the $seq object (must be seq) to the stream
+ Returns : 1 for success and 0 for error
+ Args    : array of 1 to n Bio::SeqI objects
+
+
+=cut
+
+sub write_seq {
+    my ($self,@seqs) = @_;
+
+    foreach my $seq ( @seqs ) { 
+	$self->throw("Attempting to write with no seq!") unless defined $seq;
+	unless ( ref $seq && $seq->isa('Bio::SeqI' ) ) {
+	    $self->warn("$seq is not a SeqI compliant sequence object!") 
+		if $self->verbose >= 0;
+	    unless ( ref $seq && $seq->isa('Bio::PrimarySeqI' ) ) {
+		$self->throw("$seq is not a PrimarySeqI compliant sequence object!");
+	    }
+	}
+	my $str = $seq->seq || '';
+
+	my $mol;
+	my $div;
+	my $len = $seq->length();
+
+	if ($seq->can('division') && defined $seq->division) {
+	    $div = $seq->division();
+	}
+	$div ||= 'UNK';
+
+	if ($seq->can('molecule')) {
+	    $mol = $seq->molecule();
+	    $mol = 'RNA' if defined $mol && $mol =~ /RNA/; # no 'mRNA' 
+	}
+	elsif ($seq->can('primary_seq') && defined $seq->primary_seq->alphabet) {
+	    my $alphabet =$seq->primary_seq->alphabet;
+	    if ($alphabet eq 'dna') {
+		$mol ='DNA';
+	    }
+	    elsif ($alphabet eq 'rna') {
+		$mol='RNA';
+	    }
+	    elsif ($alphabet eq 'protein') {
+		$mol='AA';
+	    }
+	}
+	$mol ||= 'XXX';
+	$mol = "circular $mol" if $seq->is_circular;
+
+	my $temp_line;
+	if( $self->_id_generation_func ) {
+	    $temp_line = &{$self->_id_generation_func}($seq);
+	} else {
+	    $temp_line = sprintf("%-11sstandard; $mol; $div; %d BP.", $seq->id(), $len);
+	} 
+
+	$self->_print( "ID   $temp_line\n","XX\n");
+
+	# Write the accession line if present
+	my( $acc );
+	{
+	    if( my $func = $self->_ac_generation_func ) {
+		$acc = &{$func}($seq);
+	    } elsif( $seq->isa('Bio::Seq::RichSeqI') && 
+		     defined($seq->accession_number) ) {
+		$acc = $seq->accession_number;
+		$acc = join(";", $acc, $seq->get_secondary_accessions);
+	    } elsif ( $seq->can('accession_number') ) {
+		$acc = $seq->accession_number;
+	    }
+
+	    if (defined $acc) {
+		$self->_print("AC   $acc;\n",
+			      "XX\n");
+	    }
+	}
+
+	# Write the sv line if present
+	{
+	    my( $sv );
+	    if (my $func = $self->_sv_generation_func) {
+		$sv = &{$func}($seq);
+	    } elsif($seq->isa('Bio::Seq::RichSeqI') && 
+		    defined($seq->seq_version)) {
+		$sv = "$acc.". $seq->seq_version();
+	    }	
+	    if (defined $sv) {
+		$self->_print( "SV   $sv\n",
+			       "XX\n");
+	    }
+	}
+
+	# Date lines
+	my $switch=0;
+	if( $seq->can('get_dates') ) {
+	    foreach my $dt ( $seq->get_dates() ) {
+		$self->_write_line_EMBL_regex("DT   ","DT   ",$dt,'\s+|$',80);#'
+		    $switch=1;
+	    }
+	    if ($switch == 1) {
+		$self->_print("XX\n");
+	    }
+	}
+
+	# Description lines
+	$self->_write_line_EMBL_regex("DE   ","DE   ",$seq->desc(),'\s+|$',80);	#'
+	$self->_print( "XX\n");
+
+	# if there, write the kw line
+	{
+	    my( $kw );
+	    if( my $func = $self->_kw_generation_func ) {
+		$kw = &{$func}($seq);
+	    } elsif( $seq->can('keywords') ) {		
+		$kw = $seq->keywords;
+		if( ref($kw) =~ /ARRAY/i ) {
+		    $kw = join("; ", @$kw);
+		}
+		$kw .= '.' if( defined $kw && $kw !~ /\.$/ );
+	    }
+	    if (defined $kw) {
+		$self->_write_line_EMBL_regex("KW   ", "KW   ", 
+					      $kw, '\s+|$', 80); #'
+		$self->_print( "XX\n");
+		
+	    }
+	}
+
+	# Organism lines
+
+	if ($seq->can('species') && (my $spec = $seq->species)) {
+	    my($species, @class) = $spec->classification();
+	    my $genus = $class[0];
+	    my $OS = "$genus $species";
+	    if (my $ssp = $spec->sub_species) {
+		$OS .= " $ssp";
+	    }
+	    if (my $common = $spec->common_name) {
+		$OS .= " ($common)";
+	    }
+	    $self->_print("OS   $OS\n");
+	    my $OC = join('; ', reverse(@class)) .'.';
+	    $self->_write_line_EMBL_regex("OC   ","OC   ",$OC,'; |$',80); #'
+	    if ($spec->organelle) {
+		$self->_write_line_EMBL_regex("OG   ","OG   ",$spec->organelle,'; |$',80); #'
+	    }
+	    $self->_print("XX\n");
+	}
+
+	# Reference lines
+	my $t = 1;
+	if ( $seq->can('annotation') && defined $seq->annotation ) {
+	    foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
+		$self->_print( "RN   [$t]\n");
+
+		# Having no RP line is legal, but we need both
+		# start and end for a valid location.
+		my $start = $ref->start;
+		my $end   = $ref->end;
+		if ($start and $end) {
+		    $self->_print( "RP   $start-$end\n");
+		} elsif ($start or $end) {
+		    $self->throw("Both start and end are needed for a valid RP line.  Got: start='$start' end='$end'");
+		}
+
+		if (my $med = $ref->medline) {
+		    $self->_print( "RX   MEDLINE; $med.\n");
+		}
+		if (my $pm = $ref->pubmed) {
+		    $self->_print( "RX   PUBMED; $pm.\n");
+		}
+		$self->_write_line_EMBL_regex("RA   ", "RA   ", 
+					      $ref->authors . ";",
+					      '\s+|$', 80); #'
+
+		# If there is no title to the reference, it appears
+		# as a single semi-colon.  All titles must end in
+		# a semi-colon.
+		my $ref_title = $ref->title || '';
+		$ref_title =~ s/[\s;]*$/;/;
+		$self->_write_line_EMBL_regex("RT   ", "RT   ", $ref_title,    '\s+|$', 80); #'
+		$self->_write_line_EMBL_regex("RL   ", "RL   ", $ref->location, '\s+|$', 80); #'
+		if ($ref->comment) {
+		    $self->_write_line_EMBL_regex("RC   ", "RC   ", $ref->comment, '\s+|$', 80); #' 
+		}
+		$self->_print("XX\n");
+		$t++;
+	    }
+
+
+	    # DB Xref lines
+	    if (my @db_xref = $seq->annotation->get_Annotations('dblink') ) {
+		foreach my $dr (@db_xref) {
+		    my $db_name = $dr->database;
+		    my $prim    = $dr->primary_id;
+		    my $opt     = $dr->optional_id || '';
+
+		    my $line = "$db_name; $prim; $opt.";
+		    $self->_write_line_EMBL_regex("DR   ", "DR   ", $line, '\s+|$', 80); #'
+		}
+		$self->_print("XX\n");
+	    }
+
+	    # Comment lines
+	    foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
+		$self->_write_line_EMBL_regex("CC   ", "CC   ", $comment->text, '\s+|$', 80); #'
+		$self->_print("XX\n");
+	    }
+	}
+	# "\\s\+\|\$"
+
+	## FEATURE TABLE
+
+	$self->_print("FH   Key             Location/Qualifiers\n");
+	$self->_print("FH\n");
+
+	my @feats = $seq->can('top_SeqFeatures') ? $seq->top_SeqFeatures : ();
+	if ($feats[0]) {
+	    if( defined $self->_post_sort ) {
+		# we need to read things into an array. 
+		# Process. Sort them. Print 'em
+
+		my $post_sort_func = $self->_post_sort();
+		my @fth;
+
+		foreach my $sf ( @feats ) {
+		    push(@fth,Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq));
+		}
+
+		@fth = sort { &$post_sort_func($a,$b) } @fth;
+
+		foreach my $fth ( @fth ) {
+		    $self->_print_EMBL_FTHelper($fth);
+		}
+	    } else {
+		# not post sorted. And so we can print as we get them.
+		# lower memory load...
+
+		foreach my $sf ( @feats ) {
+		    my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq);
+		    foreach my $fth ( @fth ) {
+			if( $fth->key eq 'CONTIG') {
+			    $self->_show_dna(0);
+			}
+			$self->_print_EMBL_FTHelper($fth);
+		    }
+		}
+	    }
+	}
+
+	if( $self->_show_dna() == 0 ) {
+	    $self->_print( "//\n");
+	    return;
+	}
+	$self->_print( "XX\n");
+
+	# finished printing features.
+	
+	$str =~ tr/A-Z/a-z/;
+
+	# Count each nucleotide
+	my $alen = $str =~ tr/a/a/;
+	my $clen = $str =~ tr/c/c/;
+	my $glen = $str =~ tr/g/g/;
+	my $tlen = $str =~ tr/t/t/;
+
+	my $olen = $len - ($alen + $tlen + $clen + $glen);
+	if( $olen < 0 ) {
+	    $self->warn("Weird. More atgc than bases. Problem!");
+	}
+
+	$self->_print("SQ   Sequence $len BP; $alen A; $clen C; $glen G; $tlen T; $olen other;\n");
+
+	my $nuc = 60;		# Number of nucleotides per line
+	my $whole_pat = 'a10' x 6; # Pattern for unpacking a whole line
+	my $out_pat   = 'A11' x 6; # Pattern for packing a line
+	my $length = length($str);
+
+	# Calculate the number of nucleotides which fit on whole lines
+	my $whole = int($length / $nuc) * $nuc;
+
+	# Print the whole lines
+	my( $i );
+	for ($i = 0; $i < $whole; $i += $nuc) {
+	    my $blocks = pack $out_pat,
+	    unpack $whole_pat,
+	    substr($str, $i, $nuc);
+	    $self->_print(sprintf("     $blocks%9d\n", $i + $nuc));
+	}
+
+	# Print the last line
+	if (my $last = substr($str, $i)) {
+	    my $last_len = length($last);
+	    my $last_pat = 'a10' x int($last_len / 10) .'a'. $last_len % 10;
+	    my $blocks = pack $out_pat,
+	    unpack($last_pat, $last);
+	    $self->_print(sprintf("     $blocks%9d\n", $length)); # Add the length to the end
+	}
+
+	$self->_print( "//\n");
+
+	$self->flush if $self->_flush_on_write && defined $self->_fh;
+	return 1;
+    }
+}
+
+=head2 _print_EMBL_FTHelper
+
+ Title   : _print_EMBL_FTHelper
+ Usage   :
+ Function: Internal function
+ Returns : 
+ Args    :
+
+
+=cut
+
+sub _print_EMBL_FTHelper {
+   my ($self,$fth,$always_quote) = @_;
+   $always_quote ||= 0;
+   
+   if( ! ref $fth || ! $fth->isa('Bio::SeqIO::FTHelper') ) {
+       $fth->warn("$fth is not a FTHelper class. Attempting to print, but there could be tears!");
+   }
+   
+
+   #$self->_print( "FH   Key             Location/Qualifiers\n");
+   #$self->_print( sprintf("FT   %-15s  %s\n",$fth->key,$fth->loc));
+   # let
+   if( $fth->key eq 'CONTIG' ) {
+       $self->_print("XX\n");
+       $self->_write_line_EMBL_regex("CO   ",
+				     "CO   ",$fth->loc,
+				     '\,|$',80); #'
+       return;
+   } 
+   $self->_write_line_EMBL_regex(sprintf("FT   %-15s ",$fth->key),
+				 "FT                   ",$fth->loc,
+				 '\,|$',80); #'
+   foreach my $tag ( keys %{$fth->field} ) {
+       if( ! defined $fth->field->{$tag} ) { next; } 
+       foreach my $value ( @{$fth->field->{$tag}} ) {
+	   $value =~ s/\"/\"\"/g;
+	   if ($value eq "_no_value") {
+	       $self->_write_line_EMBL_regex("FT                   ",
+					     "FT                   ",
+					     "/$tag",'.|$',80); #'
+	   }
+           elsif( $always_quote == 1 || $value !~ /^\d+$/ ) {
+              my $pat = $value =~ /\s/ ? '\s|$' : '.|$';
+	      $self->_write_line_EMBL_regex("FT                   ",
+					    "FT                   ",
+					    "/$tag=\"$value\"",$pat,80);
+           }
+           else {
+              $self->_write_line_EMBL_regex("FT                   ",
+					    "FT                   ",
+					    "/$tag=$value",'.|$',80); #'
+           }
+	  # $self->_print( "FT                   /", $tag, "=\"", $value, "\"\n");
+       }
+   }
+}
+
+#'
+=head2 _read_EMBL_References
+
+ Title   : _read_EMBL_References
+ Usage   :
+ Function: Reads references from EMBL format. Internal function really
+ Example :
+ Returns : 
+ Args    :
+
+
+=cut
+
+sub _read_EMBL_References {
+   my ($self,$buffer) = @_;
+   my (@refs);
+   
+   # assumme things are starting with RN
+
+   if( $$buffer !~ /^RN/ ) {
+       warn("Not parsing line '$$buffer' which maybe important");
+   }
+   my $b1;
+   my $b2;
+   my $title;
+   my $loc;
+   my $au;
+   my $med;
+   my $pm;
+   my $com;
+
+   while( defined ($_ = $self->_readline) ) {
+       /^R/ || last;
+       /^RP   (\d+)-(\d+)/ && do {$b1=$1;$b2=$2;};
+       /^RX   MEDLINE;\s+(\d+)/ && do {$med=$1};
+       /^RX   PUBMED;\s+(\d+)/ && do {$pm=$1};       
+       /^RA   (.*)/ && do {
+	   $au = $self->_concatenate_lines($au,$1); next;
+       };
+       /^RT   (.*)/ && do {
+	   $title = $self->_concatenate_lines($title,$1); next;
+       };
+       /^RL   (.*)/ && do {
+	   $loc = $self->_concatenate_lines($loc,$1); next;
+       };
+       /^RC   (.*)/ && do {
+	   $com = $self->_concatenate_lines($com,$1); next;
+       };
+   }
+   
+   my $ref = new Bio::Annotation::Reference;
+   $au =~ s/;\s*$//g;
+   $title =~ s/;\s*$//g;
+
+   $ref->start($b1);
+   $ref->end($b2);
+   $ref->authors($au);
+   $ref->title($title);
+   $ref->location($loc);
+   $ref->medline($med);
+   $ref->comment($com);
+   $ref->pubmed($pm);
+
+   push(@refs,$ref);
+   $$buffer = $_;
+   
+   return @refs;
+}
+
+=head2 _read_EMBL_Species
+
+ Title   : _read_EMBL_Species
+ Usage   :
+ Function: Reads the EMBL Organism species and classification
+           lines.
+ Example :
+ Returns : A Bio::Species object
+ Args    : a reference to the current line buffer
+
+=cut
+
+sub _read_EMBL_Species {
+    my( $self, $buffer ) = @_;
+    my $org;
+
+    $_ = $$buffer;
+    my( $sub_species, $species, $genus, $common, @class );
+    while (defined( $_ ||= $self->_readline )) {
+        
+        if (/^OS\s+(\S+)(?:\s+([^\(]\S*))?(?:\s+([^\(]\S*))?(?:\s+\((.*)\))?/) {
+            $genus   = $1;
+	    $species = $2 || 'sp.';
+	    $sub_species = $3 if $3;
+            $common      = $4 if $4;
+        }
+        elsif (s/^OC\s+//) {
+	    # only split on ';' or '.' so that 
+	    # classification that is 2 words will 
+	    # still get matched
+	    # use map to remove trailing/leading spaces
+	    chomp;
+            push(@class,  map { s/^\s+//; s/\s+$//; $_; } split /[;\.]+/);
+        }
+	elsif (/^OG\s+(.*)/) {
+	    $org = $1;
+	}
+        else {
+            last;
+        }
+        
+        $_ = undef; # Empty $_ to trigger read of next line
+    }
+    
+    $$buffer = $_;
+    
+    # Don't make a species object if it is "Unknown" or "None"
+    return if $genus =~ /^(Unknown|None)$/i;
+
+    # Bio::Species array needs array in Species -> Kingdom direction
+    if ($class[$#class] eq $genus) {
+        push( @class, $species );
+    } else {
+        push( @class, $genus, $species );
+    }
+    @class = reverse @class;
+    
+    my $make = Bio::Species->new();
+    $make->classification( \@class, "FORCE" );  # no name validation please
+    $make->common_name( $common      ) if $common;
+    $make->sub_species( $sub_species ) if $sub_species;
+    $make->organelle  ( $org         ) if $org;
+    return $make;
+}
+
+=head2 _read_EMBL_DBLink
+
+ Title   : _read_EMBL_DBLink
+ Usage   :
+ Function: Reads the EMBL database cross reference ("DR") lines
+ Example :
+ Returns : A list of Bio::Annotation::DBLink objects
+ Args    :
+
+=cut
+
+sub _read_EMBL_DBLink {
+    my( $self,$buffer ) = @_;
+    my( @db_link );
+
+    $_ = $$buffer;
+    while (defined( $_ ||= $self->_readline )) {
+        
+        if (my($databse, $prim_id, $sec_id)
+                = /^DR   ([^\s;]+);\s*([^\s;]+);\s*([^\s;]+)?\.$/) {
+            my $link = Bio::Annotation::DBLink->new();
+            $link->database   ( $databse );
+            $link->primary_id ( $prim_id );
+            $link->optional_id( $sec_id  ) if $sec_id;
+            push(@db_link, $link);
+	}
+        else {
+            last;
+        }
+        
+        $_ = undef; # Empty $_ to trigger read of next line
+    }
+    
+    $$buffer = $_;
+    
+    return @db_link;
+}
+
+=head2 _filehandle
+
+ Title   : _filehandle
+ Usage   : $obj->_filehandle($newval)
+ Function: 
+ Example : 
+ Returns : value of _filehandle
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub _filehandle{
+   my ($obj,$value) = @_;
+   if( defined $value) {
+      $obj->{'_filehandle'} = $value;
+    }
+    return $obj->{'_filehandle'};
+
+}
+
+=head2 _read_FTHelper_EMBL
+
+ Title   : _read_FTHelper_EMBL
+ Usage   : _read_FTHelper_EMBL($buffer)
+ Function: reads the next FT key line
+ Example :
+ Returns : Bio::SeqIO::FTHelper object 
+ Args    : filehandle and reference to a scalar
+
+
+=cut
+
+sub _read_FTHelper_EMBL {
+    my ($self,$buffer) = @_;
+    
+    my ($key,   # The key of the feature
+        $loc,   # The location line from the feature
+        @qual,  # An arrray of lines making up the qualifiers
+	);
+    
+    if ($$buffer =~ /^FT\s{3}(\S+)\s+(\S+)/) {
+        $key = $1;
+        $loc = $2;
+        # Read all the lines up to the next feature
+        while ( defined($_ = $self->_readline) ) {
+            if (/^FT(\s+)(.+?)\s*$/) {
+                # Lines inside features are preceeded by 19 spaces
+                # A new feature is preceeded by 3 spaces
+                if (length($1) > 4) {
+                    # Add to qualifiers if we're in the qualifiers
+                    if (@qual) {
+                        push(@qual, $2);
+                    }
+                    # Start the qualifier list if it's the first qualifier
+                    elsif (substr($2, 0, 1) eq '/') {
+                        @qual = ($2);
+                    }
+                    # We're still in the location line, so append to location
+                    else {
+                        $loc .= $2;
+                    }
+                } else {
+                    # We've reached the start of the next feature
+                    last;
+                }
+            } else {
+                # We're at the end of the feature table
+                last;
+            }
+        }
+    } elsif( $$buffer =~ /^CO\s+(\S+)/) {
+	$key = 'CONTIG';
+	$loc = $1;
+	# Read all the lines up to the next feature
+	while ( defined($_ = $self->_readline) ) {
+	    if (/^CO\s+(\S+)\s*$/) {
+		$loc .= $1;
+	    } else {
+		# We've reached the start of the next feature
+		last;
+	    }
+	}
+    } else {
+        # No feature key
+        return;
+    }
+    
+    # Put the first line of the next feature into the buffer
+    $$buffer = $_;
+
+    # Make the new FTHelper object
+    my $out = new Bio::SeqIO::FTHelper();
+    $out->verbose($self->verbose());
+    $out->key($key);
+    $out->loc($loc);
+
+    # Now parse and add any qualifiers.  (@qual is kept
+    # intact to provide informative error messages.)
+  QUAL: for (my $i = 0; $i < @qual; $i++) {
+        $_ = $qual[$i];
+        my( $qualifier, $value ) = m{^/([^=]+)(?:=(.+))?}
+            or $self->throw("Can't see new qualifier in: $_\nfrom:\n"
+                . join('', map "$_\n", @qual));
+        if (defined $value) {
+            # Do we have a quoted value?
+            if (substr($value, 0, 1) eq '"') {
+                # Keep adding to value until we find the trailing quote
+                # and the quotes are balanced
+                while ($value !~ /"$/ or $value =~ tr/"/"/ % 2) { #"
+                    $i++;
+                    my $next = $qual[$i];
+                    unless (defined($next)) {
+                        warn("Unbalanced quote in:\n", map("$_\n", @qual),
+                            "No further qualifiers will be added for this feature");
+                        last QUAL;
+                    }
+
+                    # Join to value with space if value or next line contains a space
+                    $value .= (grep /\s/, ($value, $next)) ? " $next" : $next;
+                }
+                # Trim leading and trailing quotes
+                $value =~ s/^"|"$//g;
+                # Undouble internal quotes
+                $value =~ s/""/"/g; #"
+            }
+        } else {
+            $value = '_no_value';
+        }
+
+        # Store the qualifier
+        $out->field->{$qualifier} ||= [];
+        push(@{$out->field->{$qualifier}},$value);
+    }   
+
+    return $out;
+}
+
+=head2 _write_line_EMBL
+
+ Title   : _write_line_EMBL
+ Usage   :
+ Function: internal function
+ Example :
+ Returns : 
+ Args    :
+
+
+=cut
+
+sub _write_line_EMBL {
+   my ($self,$pre1,$pre2,$line,$length) = @_;
+
+   $length || die "Miscalled write_line_EMBL without length. Programming error!";
+   my $subl = $length - length $pre2;
+   my $linel = length $line;
+   my $i;
+
+   my $sub = substr($line,0,$length - length $pre1);
+
+   $self->_print( "$pre1$sub\n");
+   
+   for($i= ($length - length $pre1);$i < $linel;) {
+       $sub = substr($line,$i,($subl));
+       $self->_print( "$pre2$sub\n");
+       $i += $subl;
+   }
+
+}
+
+=head2 _write_line_EMBL_regex
+
+ Title   : _write_line_EMBL_regex
+ Usage   :
+ Function: internal function for writing lines of specified
+           length, with different first and the next line 
+           left hand headers and split at specific points in the
+           text
+ Example :
+ Returns : nothing
+ Args    : file handle, first header, second header, text-line, regex for line breaks, total line length
+
+
+=cut
+
+sub _write_line_EMBL_regex {
+    my ($self,$pre1,$pre2,$line,$regex,$length) = @_;
+
+    #print STDOUT "Going to print with $line!\n";
+
+    $length || die "Programming error - called write_line_EMBL_regex without length.";
+
+    my $subl = $length - (length $pre1) -1 ;
+
+
+
+    my( @lines );
+    while(defined $line && 
+	  $line =~ m/(.{1,$subl})($regex)/g) {
+	push(@lines, $1.$2);
+    }
+    foreach (@lines) { s/\s+$//; }
+    
+    # Print first line
+    my $s = shift(@lines) || '';    
+    $self->_print( "$pre1$s\n");
+    
+    # Print the rest
+    foreach my $s ( @lines ) {
+	$s = '' if( !defined $s );
+        $self->_print( "$pre2$s\n");
+    }
+}
+
+=head2 _post_sort
+
+ Title   : _post_sort
+ Usage   : $obj->_post_sort($newval)
+ Function: 
+ Returns : value of _post_sort
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub _post_sort{
+   my $obj = shift;
+   if( @_ ) {
+      my $value = shift;
+      $obj->{'_post_sort'} = $value;
+    }
+    return $obj->{'_post_sort'};
+
+}
+
+=head2 _show_dna
+
+ Title   : _show_dna
+ Usage   : $obj->_show_dna($newval)
+ Function: 
+ Returns : value of _show_dna
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub _show_dna{
+   my $obj = shift;
+   if( @_ ) {
+      my $value = shift;
+      $obj->{'_show_dna'} = $value;
+    }
+    return $obj->{'_show_dna'};
+
+}
+
+=head2 _id_generation_func
+
+ Title   : _id_generation_func
+ Usage   : $obj->_id_generation_func($newval)
+ Function: 
+ Returns : value of _id_generation_func
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub _id_generation_func{
+   my $obj = shift;
+   if( @_ ) {
+      my $value = shift;
+      $obj->{'_id_generation_func'} = $value;
+    }
+    return $obj->{'_id_generation_func'};
+
+}
+
+=head2 _ac_generation_func
+
+ Title   : _ac_generation_func
+ Usage   : $obj->_ac_generation_func($newval)
+ Function: 
+ Returns : value of _ac_generation_func
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub _ac_generation_func{
+   my $obj = shift;
+   if( @_ ) {
+      my $value = shift;
+      $obj->{'_ac_generation_func'} = $value;
+    }
+    return $obj->{'_ac_generation_func'};
+
+}
+
+=head2 _sv_generation_func
+
+ Title   : _sv_generation_func
+ Usage   : $obj->_sv_generation_func($newval)
+ Function: 
+ Returns : value of _sv_generation_func
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub _sv_generation_func{
+   my $obj = shift;
+   if( @_ ) {
+      my $value = shift;
+      $obj->{'_sv_generation_func'} = $value;
+    }
+    return $obj->{'_sv_generation_func'};
+
+}
+
+=head2 _kw_generation_func
+
+ Title   : _kw_generation_func
+ Usage   : $obj->_kw_generation_func($newval)
+ Function: 
+ Returns : value of _kw_generation_func
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub _kw_generation_func{
+   my $obj = shift;
+   if( @_ ) {
+      my $value = shift;
+      $obj->{'_kw_generation_func'} = $value;
+    }
+    return $obj->{'_kw_generation_func'};
+
+}
+
+1;
author	mahtabm
date	Thu, 11 Apr 2013 06:29:17 -0400
parents
children