ensemb_rep_gvl: variant_effect_predictor/Bio/SeqIO/embl.pm comparison

comparison variant_effect_predictor/Bio/SeqIO/embl.pm @ 0:2bc9b66ada89 draft default tip

Uploaded

author	mahtabm
date	Thu, 11 Apr 2013 06:29:17 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:2bc9b66ada89
+# $Id: embl.pm,v 1.57.2.6 2003/09/14 19:06:51 jason Exp $
+#
+# BioPerl module for Bio::SeqIO::EMBL
+#
+# Cared for by Ewan Birney <birney@ebi.ac.uk>
+#
+# Copyright Ewan Birney
+#
+# You may distribute this module under the same terms as perl itself
+# POD documentation - main docs before the code
+=head1 NAME
+Bio::SeqIO::embl - EMBL sequence input/output stream
+=head1 SYNOPSIS
+It is probably best not to use this object directly, but
+rather go through the AnnSeqIO handler system. Go:
+$stream = Bio::SeqIO->new(-file => $filename, -format => 'EMBL');
+while ( (my $seq = $stream->next_seq()) ) {
+	# do something with $seq
+}
+=head1 DESCRIPTION
+This object can transform Bio::Seq objects to and from EMBL flat
+file databases.
+There is a lot of flexibility here about how to dump things, which I need
+to document fully.
+There should be a common object that this and genbank share (probably
+with swissprot). Too much of the magic is identical.
+=head2 Optional functions
+=over 3
+=item _show_dna()
+(output only) shows the dna or not
+=item _post_sort()
+(output only) provides a sorting func which is applied to the FTHelpers
+before printing
+=item _id_generation_func()
+This is a function which is called as
+print "ID   ", $func($annseq), "\n";
+To generate the ID line. If it is not there, it generates a sensible ID
+line using a number of tools.
+If you want to output annotations in embl format they need to be
+stored in a Bio::Annotation::Collection object which is accessible
+through the Bio::SeqI interface method L<annotation()|annotation>.
+The following are the names of the keys which are polled from a
+L<Bio::Annotation::Collection> object.
+reference       - Should contain Bio::Annotation::Reference objects
+comment         - Should contain Bio::Annotation::Comment objects
+dblink          - Should contain Bio::Annotation::DBLink objects
+=back
+=head1 FEEDBACK
+=head2 Mailing Lists
+User feedback is an integral part of the evolution of this
+and other Bioperl modules. Send your comments and suggestions preferably
+to one of the Bioperl mailing lists.
+Your participation is much appreciated.
+bioperl-l@bioperl.org                 - General discussion
+http://www.bioperl.org/MailList.shtml - About the mailing lists
+=head2 Reporting Bugs
+Report bugs to the Bioperl bug tracking system to help us keep track of
+the bugs and their resolution.
+Bug reports can be submitted via email or the web:
+bioperl-bugs@bio.perl.org
+http://bugzilla.bioperl.org/
+=head1 AUTHOR - Ewan Birney
+Email birney@ebi.ac.uk
+Describe contact details here
+=head1 APPENDIX
+The rest of the documentation details each of the object
+methods. Internal methods are usually preceded with a _
+=cut
+# Let the code begin...
+package Bio::SeqIO::embl;
+use vars qw(@ISA);
+use strict;
+use Bio::SeqIO::FTHelper;
+use Bio::SeqFeature::Generic;
+use Bio::Species;
+use Bio::Seq::SeqFactory;
+use Bio::Annotation::Collection;
+use Bio::Annotation::Comment;
+use Bio::Annotation::Reference;
+use Bio::Annotation::DBLink;
+@ISA = qw(Bio::SeqIO);
+sub _initialize {
+    # Stream set-up: run the parent class initialiser with the caller's
+    # arguments, then install the EMBL-specific defaults.
+    my ($self, @args) = @_;
+    $self->SUPER::_initialize(@args);
+
+    # Hash for functions for decoding keys.
+    $self->{'_func_ftunit_hash'} = {};
+
+    # DNA output is switched on by default; callers can change it later.
+    $self->_show_dna(1);
+
+    # Only install a factory when none has been supplied already.
+    if ( !defined $self->sequence_factory ) {
+        my $factory = Bio::Seq::SeqFactory->new(
+            -verbose => $self->verbose(),
+            -type    => 'Bio::Seq::RichSeq',
+        );
+        $self->sequence_factory($factory);
+    }
+}
+=head2 next_seq
+Title   : next_seq
+Usage   : $seq = $stream->next_seq()
+Function: returns the next sequence in the stream
+Returns : Bio::Seq object
+Args    :
+=cut
+sub next_seq { # Reads one EMBL entry from the stream and returns the object built by sequence_factory->create; returns undef when no more input is available.
+my ($self,@args) = @_;
+my ($pseq,$c,$line,$name,$desc,$acc,$seqc,$mol,$div,
+$date, $comment, @date_arr);
+my ($annotation, %params, @features) = ( new Bio::Annotation::Collection); # only $annotation receives a value; %params and @features start empty
+$line = $self->_readline;   # This needs to be before the first eof() test
+if( !defined $line ) {
+return undef; # no throws - end of file
+}
+if( $line =~ /^\s+$/ ) { # skip whitespace-only lines in front of the entry
+while( defined ($line = $self->_readline) ) {
+	   $line =~/^\S/ && last;
+}
+}
+if( !defined $line ) {
+return undef; # end of file
+}
+$line =~ /^ID\s+\S+/ || $self->throw("EMBL stream with no ID. Not embl in my book");
+$line =~ /^ID\s+(\S+)\s+\S+\;\s+([^;]+)\;\s+(\S+)\;/; # captures name, molecule and division; the second ID field is matched but not captured
+$name = $1;
+$mol = $2;
+$div = $3;
+if(! $name) {
+$name = "unknown id";
+}
+my $alphabet;
+# this is important to have the id for display in e.g. FTHelper, otherwise
+# you won't know which entry caused an error
+if($mol) {
+if ( $mol =~ /circular/ ) {
+	   $params{'-is_circular'} = 1;
+	   $mol =~  s|circular ||; # keep only the molecule type itself
+}
+if (defined $mol ) {
+	   if ($mol =~ /DNA/) {
+	       $alphabet='dna';
+	   }
+	   elsif ($mol =~ /RNA/) {
+	       $alphabet='rna';
+	   }
+	   elsif ($mol =~ /AA/) {
+	       $alphabet='protein';
+	   }
+}
+}
+#   $self->warn("not parsing upper annotation in EMBL file yet!");
+my $buffer = $line;
+local $_; # the helper readers called below assign to $_; keep that from leaking to our caller
+BEFORE_FEATURE_TABLE :
+until( !defined $buffer ) { # header section: one pass per line until FH/FT or end of input
+$_ = $buffer;
+# Exit at start of Feature table
+last if /^F[HT]/;
+# Description line(s)
+if (/^DE\s+(\S.*\S)/) {
+$desc .= $desc ? " $1" : $1; # join continuation lines with a single space
+}
+#accession number
+if( /^AC\s+(.*)?/ ) {
+	   my @accs = split(/[; ]+/, $1); # allow space in addition
+	   $params{'-accession_number'} = shift @accs
+	       unless defined $params{'-accession_number'};
+	   push @{$params{'-secondary_accessions'}}, @accs; # everything after the primary accession
+}
+#version number
+if( /^SV\s+\S+\.(\d+);?/ ) {
+	   my $sv = $1;
+	   #$sv =~ s/\;//;
+	   $params{'-seq_version'} = $sv;
+	   $params{'-version'} = $sv;
+}
+#date (every DT line is appended to -dates)
+if( /^DT\s+(.+)$/ ) {
+	   my $date = $1;
+	   push @{$params{'-dates'}}, $date;
+}
+#keywords
+if( /^KW\s+(.*)\S*$/ ) { # the elsif branches below hang off this test
+	   my @kw = split(/\s*\;\s*/,$1);
+	   push @{$params{'-keywords'}}, @kw;
+}
+# Organism name and phylogenetic information
+elsif (/^O[SC]/) {
+my $species = $self->_read_EMBL_Species(\$buffer);
+$params{'-species'}= $species;
+}
+# References
+elsif (/^R/) {
+	   my @refs = $self->_read_EMBL_References(\$buffer);
+	   foreach my $ref ( @refs ) {
+	       $annotation->add_Annotation('reference',$ref);
+	   }
+}
+# DB Xrefs
+elsif (/^DR/) {
+	   my @links = $self->_read_EMBL_DBLink(\$buffer);
+	   foreach my $dblink ( @links ) {
+	       $annotation->add_Annotation('dblink',$dblink);
+	   }
+}
+# Comments
+elsif (/^CC\s+(.*)/) {
+	   $comment .= $1;
+	   $comment .= " ";
+	   while (defined ($_ = $self->_readline) ) { # gather the following CC lines into the same comment
+	       if (/^CC\s+(.*)/) {
+		   $comment .= $1;
+		   $comment .= " ";
+	       }
+	       else {
+		   last;
+	       }
+	   }
+	   my $commobj = Bio::Annotation::Comment->new();
+	   $commobj->text($comment);
+	   $annotation->add_Annotation('comment',$commobj);
+	   $comment = ""; # start afresh for the next CC block
+}
+# Get next line.
+$buffer = $self->_readline; # NOTE(review): this overwrites the stop line the helper readers stored in $buffer, so that line is never examined here - confirm intended
+}
+while( defined ($_ = $self->_readline) ) { # read on until the first feature, sequence or contig line
+/^FT   \w/ && last;
+/^SQ / && last;
+/^CO / && last;
+}
+$buffer = $_;
+if (defined($buffer) && $buffer =~ /^FT /) {
+until( !defined ($buffer) ) { # one FTHelper per feature; the helper leaves the next unread line in $buffer
+	 my $ftunit = $self->_read_FTHelper_EMBL(\$buffer);
+	 # process ftunit
+	 push(@features,
+	      $ftunit->_generic_seqfeature($self->location_factory(), $name));
+	 if( $buffer !~ /^FT/ ) {
+	     last;
+	 }
+}
+}
+# skip comments
+while( defined ($buffer) && $buffer =~ /^XX/ ) {
+$buffer = $self->_readline();
+}
+if( $buffer =~ /^CO/  ) { # NOTE(review): $buffer can be undef here when input ended above - confirm
+until( !defined ($buffer) ) {
+	   my $ftunit = $self->_read_FTHelper_EMBL(\$buffer);
+	   # process ftunit
+	   push(@features,
+		$ftunit->_generic_seqfeature($self->location_factory(),
+					     $name));
+	   if( $buffer !~ /^CO/ ) {
+	       last;
+	   }
+}
+}
+if( $buffer !~ /^SQ/  ) { # not yet at the sequence header: read forward to it
+while( defined ($_ = $self->_readline) ) {
+	   /^SQ/ && last;
+}
+}
+$seqc = "";
+while( defined ($_ = $self->_readline) ) { # sequence lines run up to the "//" terminator
+/^\/\// && last;
+$_ = uc($_);
+s/[^A-Za-z]//g; # drop everything that is not a letter (spacing, position counts)
+$seqc .= $_;
+}
+my $seq = $self->sequence_factory->create
+(-verbose => $self->verbose(),
+	-division => $div,
+	-seq => $seqc,
+	-desc => $desc,
+	-display_id => $name,
+	-annotation => $annotation,
+	-molecule => $mol,
+	-alphabet => $alphabet,
+	-features => \@features,
+	%params); # %params comes last in the argument list
+return $seq;
+}
+=head2 write_seq
+Title   : write_seq
+Usage   : $stream->write_seq($seq)
+Function: writes the $seq object (must be seq) to the stream
+Returns : 1 for success and 0 for error
+Args    : array of 1 to n Bio::SeqI objects
+=cut
+sub write_seq {
+    # Purpose : write every supplied sequence object to the stream as an
+    #           EMBL entry (ID, AC, SV, DT, DE, KW, OS/OC/OG, references,
+    #           DR, CC, feature table and, unless _show_dna is 0, the SQ
+    #           block), each entry terminated by "//".
+    # Args    : one or more sequence objects; anything that is neither a
+    #           Bio::SeqI nor a Bio::PrimarySeqI makes the method throw.
+    # Returns : 1 once all sequences have been written.
+    #
+    # The previous version returned from inside the loop, so only the
+    # first sequence of a list was ever written, and it returned an empty
+    # value (without flushing) for entries written without DNA.
+    my ($self,@seqs) = @_;
+
+    foreach my $seq ( @seqs ) {
+        $self->throw("Attempting to write with no seq!") unless defined $seq;
+
+        unless ( ref $seq && $seq->isa('Bio::SeqI' ) ) {
+            $self->warn("$seq is not a SeqI compliant sequence object!")
+                if $self->verbose >= 0;
+            unless ( ref $seq && $seq->isa('Bio::PrimarySeqI' ) ) {
+                $self->throw("$seq is not a PrimarySeqI compliant sequence object!");
+            }
+        }
+
+        my $str = $seq->seq || '';
+        my $len = $seq->length();
+
+        # Division, defaulting to UNK.
+        my $div;
+        if ($seq->can('division') && defined $seq->division) {
+            $div = $seq->division();
+        }
+        $div ||= 'UNK';
+
+        # Molecule type: taken from molecule() when the object has it,
+        # otherwise derived from the alphabet; defaults to XXX.
+        my $mol;
+        if ($seq->can('molecule')) {
+            $mol = $seq->molecule();
+            $mol = 'RNA' if defined $mol && $mol =~ /RNA/; # no 'mRNA'
+        }
+        elsif ($seq->can('primary_seq') && defined $seq->primary_seq->alphabet) {
+            my $alphabet = $seq->primary_seq->alphabet;
+            if ($alphabet eq 'dna') {
+                $mol = 'DNA';
+            }
+            elsif ($alphabet eq 'rna') {
+                $mol = 'RNA';
+            }
+            elsif ($alphabet eq 'protein') {
+                $mol = 'AA';
+            }
+        }
+        $mol ||= 'XXX';
+        $mol = "circular $mol" if $seq->is_circular;
+
+        # ID line: user callback if installed, otherwise the default layout.
+        # Values are passed as sprintf arguments so a '%' in them cannot
+        # be taken for a format directive.
+        my $temp_line;
+        if( my $id_func = $self->_id_generation_func ) {
+            $temp_line = $id_func->($seq);
+        } else {
+            $temp_line = sprintf("%-11sstandard; %s; %s; %d BP.",
+                                 $seq->id(), $mol, $div, $len);
+        }
+        $self->_print( "ID   $temp_line\n","XX\n");
+
+        # Write the accession line if present
+        my( $acc );
+        {
+            if( my $func = $self->_ac_generation_func ) {
+                $acc = $func->($seq);
+            } elsif( $seq->isa('Bio::Seq::RichSeqI') &&
+                     defined($seq->accession_number) ) {
+                $acc = $seq->accession_number;
+                $acc = join(";", $acc, $seq->get_secondary_accessions);
+            } elsif ( $seq->can('accession_number') ) {
+                $acc = $seq->accession_number;
+            }
+            if (defined $acc) {
+                $self->_print("AC   $acc;\n",
+                              "XX\n");
+            }
+        }
+
+        # Write the sv line if present
+        {
+            my( $sv );
+            if (my $func = $self->_sv_generation_func) {
+                $sv = $func->($seq);
+            } elsif($seq->isa('Bio::Seq::RichSeqI') &&
+                    defined($seq->seq_version)) {
+                # NOTE(review): $acc may still be undef here when an
+                # _ac_generation_func returned nothing - confirm.
+                $sv = "$acc.". $seq->seq_version();
+            }
+            if (defined $sv) {
+                $self->_print( "SV   $sv\n",
+                               "XX\n");
+            }
+        }
+
+        # Date lines, followed by one XX separator if any were written
+        my $switch = 0;
+        if( $seq->can('get_dates') ) {
+            foreach my $dt ( $seq->get_dates() ) {
+                $self->_write_line_EMBL_regex("DT   ","DT   ",$dt,'\s+|$',80);
+                $switch = 1;
+            }
+            if ($switch == 1) {
+                $self->_print("XX\n");
+            }
+        }
+
+        # Description lines
+        $self->_write_line_EMBL_regex("DE   ","DE   ",$seq->desc(),'\s+|$',80);
+        $self->_print( "XX\n");
+
+        # if there, write the kw line
+        {
+            my( $kw );
+            if( my $func = $self->_kw_generation_func ) {
+                $kw = $func->($seq);
+            } elsif( $seq->can('keywords') ) {
+                $kw = $seq->keywords;
+                if( ref($kw) =~ /ARRAY/i ) {
+                    $kw = join("; ", @$kw);
+                }
+                # the keyword line always ends in a full stop
+                $kw .= '.' if( defined $kw && $kw !~ /\.$/ );
+            }
+            if (defined $kw) {
+                $self->_write_line_EMBL_regex("KW   ", "KW   ",
+                                              $kw, '\s+|$', 80);
+                $self->_print( "XX\n");
+            }
+        }
+
+        # Organism lines
+        if ($seq->can('species') && (my $spec = $seq->species)) {
+            # classification() is read as (species, genus, ...); the
+            # remaining elements are reversed for the OC line.
+            my($species, @class) = $spec->classification();
+            my $genus = $class[0];
+            my $OS = "$genus $species";
+            if (my $ssp = $spec->sub_species) {
+                $OS .= " $ssp";
+            }
+            if (my $common = $spec->common_name) {
+                $OS .= " ($common)";
+            }
+            $self->_print("OS   $OS\n");
+            my $OC = join('; ', reverse(@class)) .'.';
+            $self->_write_line_EMBL_regex("OC   ","OC   ",$OC,'; |$',80);
+            if ($spec->organelle) {
+                $self->_write_line_EMBL_regex("OG   ","OG   ",$spec->organelle,'; |$',80);
+            }
+            $self->_print("XX\n");
+        }
+
+        # Reference lines
+        my $t = 1;
+        if ( $seq->can('annotation') && defined $seq->annotation ) {
+            foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
+                $self->_print( "RN   [$t]\n");
+
+                # Having no RP line is legal, but we need both
+                # start and end for a valid location.
+                my $start = $ref->start;
+                my $end   = $ref->end;
+                if ($start and $end) {
+                    $self->_print( "RP   $start-$end\n");
+                } elsif ($start or $end) {
+                    $self->throw("Both start and end are needed for a valid RP line.  Got: start='$start' end='$end'");
+                }
+
+                if (my $med = $ref->medline) {
+                    $self->_print( "RX   MEDLINE; $med.\n");
+                }
+                if (my $pm = $ref->pubmed) {
+                    $self->_print( "RX   PUBMED; $pm.\n");
+                }
+
+                $self->_write_line_EMBL_regex("RA   ", "RA   ",
+                                              $ref->authors . ";",
+                                              '\s+|$', 80);
+
+                # If there is no title to the reference, it appears
+                # as a single semi-colon.  All titles must end in
+                # a semi-colon.
+                my $ref_title = $ref->title || '';
+                $ref_title =~ s/[\s;]*$/;/;
+                $self->_write_line_EMBL_regex("RT   ", "RT   ", $ref_title,    '\s+|$', 80);
+                $self->_write_line_EMBL_regex("RL   ", "RL   ", $ref->location, '\s+|$', 80);
+                if ($ref->comment) {
+                    $self->_write_line_EMBL_regex("RC   ", "RC   ", $ref->comment, '\s+|$', 80);
+                }
+                $self->_print("XX\n");
+                $t++;
+            }
+
+            # DB Xref lines
+            if (my @db_xref = $seq->annotation->get_Annotations('dblink') ) {
+                foreach my $dr (@db_xref) {
+                    my $db_name = $dr->database;
+                    my $prim    = $dr->primary_id;
+                    my $opt     = $dr->optional_id || '';
+                    my $line = "$db_name; $prim; $opt.";
+                    $self->_write_line_EMBL_regex("DR   ", "DR   ", $line, '\s+|$', 80);
+                }
+                $self->_print("XX\n");
+            }
+
+            # Comment lines
+            foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
+                $self->_write_line_EMBL_regex("CC   ", "CC   ", $comment->text, '\s+|$', 80);
+                $self->_print("XX\n");
+            }
+        }
+
+        ## FEATURE TABLE
+        $self->_print("FH   Key             Location/Qualifiers\n");
+        $self->_print("FH\n");
+
+        my @feats = $seq->can('top_SeqFeatures') ? $seq->top_SeqFeatures : ();
+        if ($feats[0]) {
+            if( defined $self->_post_sort ) {
+                # we need to read things into an array.
+                # Process. Sort them. Print 'em
+                my $post_sort_func = $self->_post_sort();
+                my @fth;
+                foreach my $sf ( @feats ) {
+                    push(@fth,Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq));
+                }
+                @fth = sort { $post_sort_func->($a,$b) } @fth;
+                foreach my $fth ( @fth ) {
+                    $self->_print_EMBL_FTHelper($fth);
+                }
+            } else {
+                # not post sorted. And so we can print as we get them.
+                # lower memory load...
+                foreach my $sf ( @feats ) {
+                    my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq);
+                    foreach my $fth ( @fth ) {
+                        # A CONTIG feature switches sequence output off
+                        # on this stream object.
+                        if( $fth->key eq 'CONTIG') {
+                            $self->_show_dna(0);
+                        }
+                        $self->_print_EMBL_FTHelper($fth);
+                    }
+                }
+            }
+        }
+
+        # No sequence wanted: close the entry and carry on with the next
+        # object instead of leaving the method.
+        if( $self->_show_dna() == 0 ) {
+            $self->_print( "//\n");
+            $self->flush if $self->_flush_on_write && defined $self->_fh;
+            next;
+        }
+        $self->_print( "XX\n");
+
+        # finished printing features.
+        $str =~ tr/A-Z/a-z/;
+
+        # Count each nucleotide
+        my $alen = $str =~ tr/a/a/;
+        my $clen = $str =~ tr/c/c/;
+        my $glen = $str =~ tr/g/g/;
+        my $tlen = $str =~ tr/t/t/;
+        my $olen = $len - ($alen + $tlen + $clen + $glen);
+        if( $olen < 0 ) {
+            $self->warn("Weird. More atgc than bases. Problem!");
+        }
+        $self->_print("SQ   Sequence $len BP; $alen A; $clen C; $glen G; $tlen T; $olen other;\n");
+
+        my $nuc = 60;              # Number of nucleotides per line
+        my $whole_pat = 'a10' x 6; # Pattern for unpacking a whole line
+        my $out_pat   = 'A11' x 6; # Pattern for packing a line
+        my $length = length($str);
+
+        # Calculate the number of nucleotides which fit on whole lines
+        my $whole = int($length / $nuc) * $nuc;
+
+        # Print the whole lines
+        my( $i );
+        for ($i = 0; $i < $whole; $i += $nuc) {
+            my $blocks = pack $out_pat,
+                unpack $whole_pat,
+                substr($str, $i, $nuc);
+            $self->_print(sprintf("     %s%9d\n", $blocks, $i + $nuc));
+        }
+
+        # Print the last line
+        if (my $last = substr($str, $i)) {
+            my $last_len = length($last);
+            my $last_pat = 'a10' x int($last_len / 10) .'a'. $last_len % 10;
+            my $blocks = pack $out_pat,
+                unpack($last_pat, $last);
+            $self->_print(sprintf("     %s%9d\n", $blocks, $length)); # Add the length to the end
+        }
+
+        $self->_print( "//\n");
+        $self->flush if $self->_flush_on_write && defined $self->_fh;
+    }
+    return 1;
+}
+=head2 _print_EMBL_FTHelper
+Title   : _print_EMBL_FTHelper
+Usage   :
+Function: Internal function
+Returns :
+Args    :
+=cut
+sub _print_EMBL_FTHelper {
+    # Purpose : print one feature (an FTHelper) as EMBL "FT" lines, or as
+    #           "CO" lines when its key is CONTIG.
+    # Args    : $fth          - the FTHelper to print
+    #           $always_quote - when 1, purely numeric qualifier values
+    #                           are quoted too (defaults to 0)
+    # Returns : nothing meaningful.
+    my ($self,$fth,$always_quote) = @_;
+    $always_quote ||= 0;
+
+    if( ! ref $fth || ! $fth->isa('Bio::SeqIO::FTHelper') ) {
+        # Warn through $self: $fth is exactly the value that failed the
+        # check, so it cannot be relied on to provide a warn() method.
+        $self->warn("$fth is not a FTHelper class. Attempting to print, but there could be tears!");
+    }
+
+    # A CONTIG pseudo-feature is written as CO lines after a separator.
+    if( $fth->key eq 'CONTIG' ) {
+        $self->_print("XX\n");
+        $self->_write_line_EMBL_regex("CO   ",
+                                      "CO   ",$fth->loc,
+                                      '\,|$',80);
+        return;
+    }
+
+    # Key and location; long locations are broken after commas.
+    $self->_write_line_EMBL_regex(sprintf("FT   %-15s ",$fth->key),
+                                  "FT                   ",$fth->loc,
+                                  '\,|$',80);
+
+    foreach my $tag ( keys %{$fth->field} ) {
+        next if ! defined $fth->field->{$tag};
+        foreach my $stored ( @{$fth->field->{$tag}} ) {
+            # Work on a copy: the loop variable aliases the element kept
+            # in the FTHelper, and doubling the quotes in place would
+            # alter the helper's data on every print.
+            my $value = $stored;
+            $value =~ s/\"/\"\"/g;
+
+            if ($value eq "_no_value") {
+                # qualifier without a value: just "/tag"
+                $self->_write_line_EMBL_regex("FT                   ",
+                                              "FT                   ",
+                                              "/$tag",'.|$',80);
+            }
+            elsif( $always_quote == 1 || $value !~ /^\d+$/ ) {
+                # quoted value; break at whitespace when there is any
+                my $pat = $value =~ /\s/ ? '\s|$' : '.|$';
+                $self->_write_line_EMBL_regex("FT                   ",
+                                              "FT                   ",
+                                              "/$tag=\"$value\"",$pat,80);
+            }
+            else {
+                # purely numeric value: written unquoted
+                $self->_write_line_EMBL_regex("FT                   ",
+                                              "FT                   ",
+                                              "/$tag=$value",'.|$',80);
+            }
+        }
+    }
+}
+#'
+=head2 _read_EMBL_References
+Title   : _read_EMBL_References
+Usage   :
+Function: Reads references from EMBL format. Internal function really
+Example :
+Returns :
+Args    :
+=cut
+sub _read_EMBL_References {
+    # Purpose : read the R* lines that follow the line held in $$buffer
+    #           and build one Bio::Annotation::Reference from them.
+    # Args    : $buffer - reference to the current line (expected to be
+    #           an RN line; anything else only triggers a warning)
+    # Returns : a list holding the single reference object.
+    # Effects : $$buffer is set to the first line that does not start
+    #           with "R" (undef at end of input); $_ is overwritten.
+    my ($self,$buffer) = @_;
+    my (@refs);
+
+    # assume things are starting with RN
+    if( $$buffer !~ /^RN/ ) {
+        warn("Not parsing line '$$buffer' which maybe important");
+    }
+
+    my ($b1, $b2, $title, $loc, $au, $med, $pm, $com);
+    while( defined ($_ = $self->_readline) ) {
+        /^R/ || last;
+        /^RP   (\d+)-(\d+)/ && do {$b1=$1;$b2=$2;};
+        /^RX   MEDLINE;\s+(\d+)/ && do {$med=$1};
+        /^RX   PUBMED;\s+(\d+)/ && do {$pm=$1};
+        # RA/RT/RL/RC may each span several lines; the pieces are joined
+        # by _concatenate_lines.
+        /^RA   (.*)/ && do {
+            $au = $self->_concatenate_lines($au,$1); next;
+        };
+        /^RT   (.*)/ && do {
+            $title = $self->_concatenate_lines($title,$1); next;
+        };
+        /^RL   (.*)/ && do {
+            $loc = $self->_concatenate_lines($loc,$1); next;
+        };
+        /^RC   (.*)/ && do {
+            $com = $self->_concatenate_lines($com,$1); next;
+        };
+    }
+
+    my $ref = Bio::Annotation::Reference->new();
+
+    # Strip the terminating semicolon.  Either field is undef when the
+    # reference had no RA / RT line, so guard the substitutions.
+    $au    =~ s/;\s*$//g if defined $au;
+    $title =~ s/;\s*$//g if defined $title;
+
+    $ref->start($b1);
+    $ref->end($b2);
+    $ref->authors($au);
+    $ref->title($title);
+    $ref->location($loc);
+    $ref->medline($med);
+    $ref->comment($com);
+    $ref->pubmed($pm);
+    push(@refs,$ref);
+
+    # hand the first unconsumed line back to the caller
+    $$buffer = $_;
+    return @refs;
+}
+=head2 _read_EMBL_Species
+Title   : _read_EMBL_Species
+Usage   :
+Function: Reads the EMBL Organism species and classification
+lines.
+Example :
+Returns : A Bio::Species object
+Args    : a reference to the current line buffer
+=cut
+sub _read_EMBL_Species { # Parses consecutive OS / OC / OG lines, starting with the line in $$buffer, into a Bio::Species object.
+my( $self, $buffer ) = @_;
+my $org;
+$_ = $$buffer; # start with the line the caller has already read
+my( $sub_species, $species, $genus, $common, @class );
+while (defined( $_ ||= $self->_readline )) { # use the line in $_ or, once it has been cleared below, read the next one
+if (/^OS\s+(\S+)(?:\s+([^\(]\S*))?(?:\s+([^\(]\S*))?(?:\s+\((.*)\))?/) { # genus, optional species, optional sub-species, optional "(common name)"
+$genus   = $1;
+	    $species = $2 || 'sp.';
+	    $sub_species = $3 if $3;
+$common      = $4 if $4;
+}
+elsif (s/^OC\s+//) {
+	    # only split on ';' or '.' so that
+	    # classification that is 2 words will
+	    # still get matched
+	    # use map to remove trailing/leading spaces
+	    chomp;
+push(@class,  map { s/^\s+//; s/\s+$//; $_; } split /[;\.]+/);
+}
+	elsif (/^OG\s+(.*)/) {
+	    $org = $1; # organelle text, passed to organelle() below
+	}
+else {
+last; # first line that is not OS/OC/OG ends the block; it stays in $_
+}
+$_ = undef; # Empty $_ to trigger read of next line
+}
+$$buffer = $_; # hand the first unconsumed line (or undef) back to the caller
+# Don't make a species object if it is "Unknown" or "None"
+return if $genus =~ /^(Unknown|None)$/i; # NOTE(review): $genus is undef when no OS line was matched - confirm callers always provide one
+# Bio::Species array needs array in Species -> Kingdom direction
+if ($class[$#class] eq $genus) { # genus already ends the OC classification: append only the species
+push( @class, $species );
+} else {
+push( @class, $genus, $species );
+}
+@class = reverse @class;
+my $make = Bio::Species->new();
+$make->classification( \@class, "FORCE" );  # no name validation please
+$make->common_name( $common      ) if $common;
+$make->sub_species( $sub_species ) if $sub_species;
+$make->organelle  ( $org         ) if $org;
+return $make;
+}
+=head2 _read_EMBL_DBLink
+Title   : _read_EMBL_DBLink
+Usage   :
+Function: Reads the EMBL database cross reference ("DR") lines
+Example :
+Returns : A list of Bio::Annotation::DBLink objects
+Args    :
+=cut
+sub _read_EMBL_DBLink {
+    # Collects consecutive "DR" lines, beginning with the line held in
+    # $$buffer, and turns each one into a Bio::Annotation::DBLink.
+    # On return $$buffer holds the first line that was not a DR line
+    # (undef at end of input).  Like its siblings it works through $_.
+    my( $self,$buffer ) = @_;
+    my @found;
+
+    $_ = $$buffer;
+    while (1) {
+        # Take the pending line, or fetch a fresh one after it was cleared
+        $_ ||= $self->_readline;
+        last unless defined $_;
+
+        # database; primary id; optional secondary id, closed by a dot
+        my @parts = /^DR   ([^\s;]+);\s*([^\s;]+);\s*([^\s;]+)?\.$/
+            or last;
+
+        my $xref = Bio::Annotation::DBLink->new();
+        $xref->database   ( $parts[0] );
+        $xref->primary_id ( $parts[1] );
+        $xref->optional_id( $parts[2] ) if $parts[2];
+        push @found, $xref;
+
+        $_ = undef; # Empty $_ to trigger read of next line
+    }
+
+    $$buffer = $_;
+    return @found;
+}
+=head2 _filehandle
+Title   : _filehandle
+Usage   : $obj->_filehandle($newval)
+Function:
+Example :
+Returns : value of _filehandle
+Args    : newvalue (optional)
+=cut
+sub _filehandle{
+    # Combined getter/setter for the '_filehandle' slot.  A new value is
+    # stored only when it is defined; the current value is returned.
+    my ($obj,$value) = @_;
+    $obj->{'_filehandle'} = $value if defined $value;
+    return $obj->{'_filehandle'};
+}
+=head2 _read_FTHelper_EMBL
+Title   : _read_FTHelper_EMBL
+Usage   : _read_FTHelper_EMBL($buffer)
+Function: reads the next FT key line
+Example :
+Returns : Bio::SeqIO::FTHelper object
+Args    : reference to a scalar holding the current line
+=cut
+sub _read_FTHelper_EMBL { # Reads one feature (FT lines) or one CONTIG statement (CO lines), starting with the line in $$buffer, into a Bio::SeqIO::FTHelper.
+my ($self,$buffer) = @_;
+my ($key,   # The key of the feature
+$loc,   # The location line from the feature
+@qual,  # An array of lines making up the qualifiers
+	);
+if ($$buffer =~ /^FT\s{3}(\S+)\s+(\S+)/) {
+$key = $1;
+$loc = $2;
+# Read all the lines up to the next feature
+while ( defined($_ = $self->_readline) ) {
+if (/^FT(\s+)(.+?)\s*$/) {
+# Lines inside features are preceded by 19 spaces
+# A new feature is preceded by 3 spaces
+if (length($1) > 4) {
+# Add to qualifiers if we're in the qualifiers
+if (@qual) {
+push(@qual, $2);
+}
+# Start the qualifier list if it's the first qualifier
+elsif (substr($2, 0, 1) eq '/') {
+@qual = ($2);
+}
+# We're still in the location line, so append to location
+else {
+$loc .= $2;
+}
+} else {
+# We've reached the start of the next feature
+last;
+}
+} else {
+# We're at the end of the feature table
+last;
+}
+}
+} elsif( $$buffer =~ /^CO\s+(\S+)/) {
+	$key = 'CONTIG'; # CO lines are represented as a feature with this key
+	$loc = $1;
+	# Read all the lines up to the next feature
+	while ( defined($_ = $self->_readline) ) {
+	    if (/^CO\s+(\S+)\s*$/) {
+		$loc .= $1; # continuation of the CO statement
+	    } else {
+		# We've reached the start of the next feature
+		last;
+	    }
+	}
+} else {
+# No feature key
+return;
+}
+# Put the first line of the next feature into the buffer
+$$buffer = $_;
+# Make the new FTHelper object
+my $out = new Bio::SeqIO::FTHelper();
+$out->verbose($self->verbose());
+$out->key($key);
+$out->loc($loc);
+# Now parse and add any qualifiers.  (@qual is kept
+# intact to provide informative error messages.)
+QUAL: for (my $i = 0; $i < @qual; $i++) { # index loop: the quote handling below advances $i itself
+$_ = $qual[$i];
+my( $qualifier, $value ) = m{^/([^=]+)(?:=(.+))?}
+or $self->throw("Can't see new qualifier in: $_\nfrom:\n"
+. join('', map "$_\n", @qual));
+if (defined $value) {
+# Do we have a quoted value?
+if (substr($value, 0, 1) eq '"') {
+# Keep adding to value until we find the trailing quote
+# and the quotes are balanced
+while ($value !~ /"$/ or $value =~ tr/"/"/ % 2) { #"
+$i++;
+my $next = $qual[$i];
+unless (defined($next)) {
+warn("Unbalanced quote in:\n", map("$_\n", @qual),
+"No further qualifiers will be added for this feature");
+last QUAL; # the qualifier being assembled is not stored either
+}
+# Join to value with space if value or next line contains a space
+$value .= (grep /\s/, ($value, $next)) ? " $next" : $next;
+}
+# Trim leading and trailing quotes
+$value =~ s/^"|"$//g;
+# Undouble internal quotes
+$value =~ s/""/"/g; #"
+}
+} else {
+$value = '_no_value'; # marker for a qualifier written without "=value"
+}
+# Store the qualifier
+$out->field->{$qualifier} ||= [];
+push(@{$out->field->{$qualifier}},$value);
+}
+return $out;
+}
+=head2 _write_line_EMBL
+Title   : _write_line_EMBL
+Usage   :
+Function: internal function
+Example :
+Returns :
+Args    :
+=cut
+sub _write_line_EMBL {
+    # Prints $line in fixed-width slices: the first slice is prefixed
+    # with $pre1, every following slice with $pre2, and each printed line
+    # is at most $length characters wide including its prefix.
+    my ($self,$pre1,$pre2,$line,$length) = @_;
+    die "Miscalled write_line_EMBL without length. Programming error!"
+        unless $length;
+
+    my $first_width = $length - length($pre1);
+    my $rest_width  = $length - length($pre2);
+    my $total       = length $line;
+
+    # First output line
+    $self->_print( $pre1 . substr($line, 0, $first_width) . "\n" );
+
+    # Remaining text, one continuation line per slice
+    my $offset = $first_width;
+    while ($offset < $total) {
+        $self->_print( $pre2 . substr($line, $offset, $rest_width) . "\n" );
+        $offset += $rest_width;
+    }
+}
+=head2 _write_line_EMBL_regex
+Title   : _write_line_EMBL_regex
+Usage   :
+Function: internal function for writing lines of specified
+length, with different first and the next line
+left hand headers and split at specific points in the
+text
+Example :
+Returns : nothing
+Args    : first header, second header, text-line, regex for line breaks, total line length
+=cut
+sub _write_line_EMBL_regex {
+    # Purpose : print $line wrapped so that no output line is wider than
+    #           $length, breaking only where $regex matches.  The first
+    #           output line is prefixed with $pre1, all later ones with
+    #           $pre2.  A header-only line is printed when $line is
+    #           undef or empty.
+    # Args    : $pre1, $pre2 - first-line and continuation-line headers
+    #           $line        - text to print (may be undef)
+    #           $regex       - pattern source for permitted break points
+    #           $length      - total line width (required; dies if false)
+    # Returns : nothing meaningful.
+    #
+    # Changes from the previous version: text is consumed strictly from
+    # the front, so a run longer than the line width with no break point
+    # is hard-wrapped instead of losing its leading characters, and a
+    # first line consisting of "0" is no longer replaced by ''.
+    my ($self,$pre1,$pre2,$line,$regex,$length) = @_;
+
+    $length || die "Programming error - called write_line_EMBL_regex without length.";
+
+    # usable text width per line
+    my $subl = $length - (length $pre1) -1 ;
+
+    my( @lines );
+    my $rest = defined $line ? $line : '';
+    while (length $rest) {
+        my $chunk;
+        if ($rest =~ m/^(.{1,$subl})($regex)/) {
+            # normal case: as much text as fits, ending at a break point
+            $chunk = $1.$2;
+        }
+        elsif ($rest =~ m/^(.{1,$subl})/) {
+            # no break point within reach: force a break at full width
+            $chunk = $1;
+        }
+        else {
+            # a leading character '.' cannot match (a newline); drop it,
+            # as the unanchored scan used to do
+            $rest = substr($rest, 1);
+            next;
+        }
+        $rest = substr($rest, length $chunk);
+        $chunk =~ s/\s+$//;
+        push(@lines, $chunk);
+    }
+
+    # Print first line
+    my $first = shift(@lines);
+    $first = '' if !defined $first;
+    $self->_print( "$pre1$first\n");
+
+    # Print the rest
+    foreach my $s ( @lines ) {
+        $self->_print( "$pre2$s\n");
+    }
+}
+=head2 _post_sort
+Title   : _post_sort
+Usage   : $obj->_post_sort($newval)
+Function:
+Returns : value of _post_sort
+Args    : newvalue (optional)
+=cut
+sub _post_sort{
+    # Getter/setter for the '_post_sort' slot.  Any argument, even undef,
+    # replaces the stored value; the current value is returned.
+    my ($obj, @new) = @_;
+    $obj->{'_post_sort'} = $new[0] if @new;
+    return $obj->{'_post_sort'};
+}
+=head2 _show_dna
+Title   : _show_dna
+Usage   : $obj->_show_dna($newval)
+Function:
+Returns : value of _show_dna
+Args    : newvalue (optional)
+=cut
+sub _show_dna{
+    # Getter/setter for the '_show_dna' flag.  Any argument, even undef,
+    # replaces the stored value; the current value is returned.
+    my ($obj, @new) = @_;
+    if (@new) {
+        $obj->{'_show_dna'} = $new[0];
+    }
+    return $obj->{'_show_dna'};
+}
+=head2 _id_generation_func
+Title   : _id_generation_func
+Usage   : $obj->_id_generation_func($newval)
+Function:
+Returns : value of _id_generation_func
+Args    : newvalue (optional)
+=cut
+sub _id_generation_func{
+    # Getter/setter for the '_id_generation_func' slot.  Any argument,
+    # even undef, replaces the stored value; the current value is
+    # returned.
+    my ($obj, @new) = @_;
+    $obj->{'_id_generation_func'} = shift @new if @new;
+    return $obj->{'_id_generation_func'};
+}
+=head2 _ac_generation_func
+Title   : _ac_generation_func
+Usage   : $obj->_ac_generation_func($newval)
+Function:
+Returns : value of _ac_generation_func
+Args    : newvalue (optional)
+=cut
+sub _ac_generation_func{
+    # Getter/setter for the '_ac_generation_func' slot.  Any argument,
+    # even undef, replaces the stored value; the current value is
+    # returned.
+    my ($obj, @new) = @_;
+    $obj->{'_ac_generation_func'} = $new[0] if scalar @new;
+    return $obj->{'_ac_generation_func'};
+}
+=head2 _sv_generation_func
+Title   : _sv_generation_func
+Usage   : $obj->_sv_generation_func($newval)
+Function:
+Returns : value of _sv_generation_func
+Args    : newvalue (optional)
+=cut
+sub _sv_generation_func{
+    # Getter/setter for the '_sv_generation_func' slot.  Any argument,
+    # even undef, replaces the stored value; the current value is
+    # returned.
+    my ($obj, @new) = @_;
+    if (@new) {
+        ($obj->{'_sv_generation_func'}) = @new;
+    }
+    return $obj->{'_sv_generation_func'};
+}
+=head2 _kw_generation_func
+Title   : _kw_generation_func
+Usage   : $obj->_kw_generation_func($newval)
+Function:
+Returns : value of _kw_generation_func
+Args    : newvalue (optional)
+=cut
+sub _kw_generation_func{
+    # Getter/setter for the '_kw_generation_func' slot.  Any argument,
+    # even undef, replaces the stored value; the current value is
+    # returned.
+    my ($obj, @new) = @_;
+    $obj->{'_kw_generation_func'} = $new[0] if @new;
+    return $obj->{'_kw_generation_func'};
+}
+1;

Mercurial > repos > mahtabm > ensemb_rep_gvl

comparison variant_effect_predictor/Bio/SeqIO/embl.pm @ 0:2bc9b66ada89 draft default tip