ensemb_rep_gvl: variant_effect_predictor/Bio/SeqIO/swiss.pm comparison

comparison variant_effect_predictor/Bio/SeqIO/swiss.pm @ 0:2bc9b66ada89 draft default tip

Uploaded

author	mahtabm
date	Thu, 11 Apr 2013 06:29:17 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:2bc9b66ada89
+# $Id: swiss.pm,v 1.66.2.4 2003/09/13 22:16:43 jason Exp $
+#
+# BioPerl module for Bio::SeqIO::swiss
+#
+# Cared for by Elia Stupka <elia@tll.org.sg>
+#
+# Copyright Elia Stupka
+#
+# You may distribute this module under the same terms as perl itself
+# POD documentation - main docs before the code
+=head1 NAME
+Bio::SeqIO::swiss - Swissprot sequence input/output stream
+=head1 SYNOPSIS
+It is probably best not to use this object directly, but
+rather go through the SeqIO handler system. Go:
+$stream = Bio::SeqIO->new(-file => $filename, -format => 'swiss');
+while ( my $seq = $stream->next_seq() ) {
+	# do something with $seq
+}
+=head1 DESCRIPTION
+This object can transform Bio::Seq objects to and from swissprot flat
+file databases.
+There is a lot of flexibility here about how to dump things which I need
+to document fully.
+=head2 Optional functions
+=over 3
+=item _show_dna()
+(output only) shows the dna or not
+=item _post_sort()
+(output only) provides a sorting func which is applied to the FTHelpers
+before printing
+=item _id_generation_func()
+This is function which is called as
+print "ID   ", $func($seq), "\n";
+To generate the ID line. If it is not there, it generates a sensible ID
+line using a number of tools.
+If you want to output annotations in swissprot format they need to be
+stored in a Bio::Annotation::Collection object which is accessible
+through the Bio::SeqI interface method L<annotation()|annotation>.
+The following are the names of the keys which are polled from a
+L<Bio::Annotation::Collection> object.
+reference       - Should contain Bio::Annotation::Reference objects
+comment         - Should contain Bio::Annotation::Comment objects
+dblink          - Should contain Bio::Annotation::DBLink objects
+gene_name       - Should contain Bio::Annotation::SimpleValue object
+=back
+=head1 FEEDBACK
+=head2 Mailing Lists
+User feedback is an integral part of the evolution of this
+and other Bioperl modules. Send your comments and suggestions preferably
+to one of the Bioperl mailing lists.
+Your participation is much appreciated.
+bioperl-l@bioperl.org          - General discussion
+http://bio.perl.org/MailList.html             - About the mailing lists
+=head2 Reporting Bugs
+Report bugs to the Bioperl bug tracking system to help us keep track
+the bugs and their resolution.
+Bug reports can be submitted via email or the web:
+bioperl-bugs@bio.perl.org
+http://bugzilla.bioperl.org/
+=head1 AUTHOR - Elia Stupka
+Email elia@tll.org.sg
+Describe contact details here
+=head1 APPENDIX
+The rest of the documentation details each of the object
+methods. Internal methods are usually preceded with a _
+=cut
+# Let the code begin...
+package Bio::SeqIO::swiss;
+use vars qw(@ISA);
+use strict;
+use Bio::SeqIO;
+use Bio::SeqIO::FTHelper;
+use Bio::SeqFeature::Generic;
+use Bio::Species;
+use Bio::Tools::SeqStats;
+use Bio::Seq::SeqFactory;
+use Bio::Annotation::Collection;
+use Bio::Annotation::Comment;
+use Bio::Annotation::Reference;
+use Bio::Annotation::DBLink;
+use Bio::Annotation::SimpleValue;
+use Bio::Annotation::StructuredValue;
+@ISA = qw(Bio::SeqIO);
+sub _initialize {
+my($self,@args) = @_;
+$self->SUPER::_initialize(@args);
+# hash for functions for decoding keys.
+$self->{'_func_ftunit_hash'} = {};
+$self->_show_dna(1); # sets this to one by default. People can change it
+if( ! defined $self->sequence_factory ) {
+$self->sequence_factory(new Bio::Seq::SeqFactory
+			      (-verbose => $self->verbose(),
+			       -type => 'Bio::Seq::RichSeq'));
+}
+}
+=head2 next_seq
+Title   : next_seq
+Usage   : $seq = $stream->next_seq()
+Function: returns the next sequence in the stream
+Returns : Bio::Seq object
+Args    :
+=cut
+sub next_seq {
+my ($self,@args) = @_;
+my ($pseq,$c,$line,$name,$desc,$acc,$seqc,$mol,$div,
+$date,$comment,@date_arr);
+my $genename = "";
+my ($annotation, %params, @features) = ( new Bio::Annotation::Collection);
+$line = $self->_readline;
+if( !defined $line) {
+return undef; # no throws - end of file
+}
+if( $line =~ /^\s+$/ ) {
+while( defined ($line = $self->_readline) ) {
+	   $line =~ /\S/ && last;
+}
+}
+if( !defined $line ) {
+return undef; # end of file
+}
+# fixed to allow _DIVISION to be optional for bug #946
+# see bug report for more information
+$line =~ /^ID\s+([^\s_]+)(_([^\s_]+))?\s+([^\s;]+);\s+([^\s;]+);/
+|| $self->throw("swissprot stream with no ID. Not swissprot in my book");
+if( $3 ) {
+$name = "$1$2";
+$params{'-division'} = $3;
+} else {
+$name = $1;
+$params{'-division'} = 'UNK';
+$params{'-primary_id'} = $1;
+}
+$params{'-alphabet'} = 'protein';
+# this is important to have the id for display in e.g. FTHelper, otherwise
+# you won't know which entry caused an error
+$params{'-display_id'} = $name;
+my $buffer = $line;
+BEFORE_FEATURE_TABLE :
+until( !defined ($buffer) ) {
+$_ = $buffer;
+# Exit at start of Feature table
+last if /^FT/;
+# and at the sequence at the latest HL 05/11/2000
+last if /^SQ/;
+# Description line(s)
+if (/^DE\s+(\S.*\S)/) {
+$desc .= $desc ? " $1" : $1;
+}
+#Gene name
+elsif(/^GN\s+(.*)/) {
+	   $genename .= " " if $genename;
+	   $genename .= $1;
+	   # has GN terminated yet?
+	   if($genename =~ s/[\. ]+$//) {
+	       my $gn = Bio::Annotation::StructuredValue->new();
+	       foreach my $gene (split(/ AND /, $genename)) {
+		   $gene =~ s/^\(//;
+		   $gene =~ s/\)$//;
+		   $gn->add_value([-1,-1], split(/ OR /, $gene));
+}
+	       $annotation->add_Annotation('gene_name',$gn,
+					   "Bio::Annotation::SimpleValue");
+}
+}
+#accession number(s)
+elsif( /^AC\s+(.+)/) {
+	   my @accs = split(/[; ]+/, $1); # allow space in addition
+	   $params{'-accession_number'} = shift @accs
+	       unless defined $params{'-accession_number'};
+	   push @{$params{'-secondary_accessions'}}, @accs;
+}
+#version number
+elsif( /^SV\s+(\S+);?/ ) {
+	   my $sv = $1;
+	   $sv =~ s/\;//;
+	   $params{'-seq_version'} = $sv;
+}
+#date
+elsif( /^DT\s+(.*)/ ) {
+	   my $date = $1;
+	   $date =~ s/\;//;
+	   $date =~ s/\s+$//;
+	   push @{$params{'-dates'}}, $date;
+}
+# Organism name and phylogenetic information
+elsif (/^O[SCG]/) {
+my $species = $self->_read_swissprot_Species(\$buffer);
+$params{'-species'}= $species;
+	   # now we are one line ahead -- so continue without reading the next
+	   # line   HL 05/11/2000
+	   next;
+}
+# References
+elsif (/^R/) {
+	   my $refs = $self->_read_swissprot_References(\$buffer);
+	   foreach my $r (@$refs) {
+	       $annotation->add_Annotation('reference',$r);
+	   }
+	   # now we are one line ahead -- so continue without reading the next
+	   # line   HL 05/11/2000
+	   next;
+}
+#Comments
+elsif (/^CC\s{3}(.*)/) {
+	   $comment .= $1;
+	   $comment .= "\n";
+	   while (defined ($buffer = $self->_readline)) {
+	       if ($buffer =~ /^CC\s{3}(.*)/) {
+		   $comment .= $1;
+		   $comment .= "\n";
+	       }
+	       else {
+		   last;
+	       }
+	   }
+	   my $commobj = Bio::Annotation::Comment->new();
+	   # note: don't try to process comments here -- they may contain
+# structure. LP 07/30/2000
+	   $commobj->text($comment);
+	   $annotation->add_Annotation('comment',$commobj);
+	   $comment = "";
+	   # now we are one line ahead -- so continue without reading the next
+	   # line   HL 05/11/2000
+	   next;
+}
+#DBLinks
+elsif (/^DR\s+(\S+)\;\s+(\S+)\;\s+(\S+)[\;\.](.*)$/) {
+	   my $dblinkobj =  Bio::Annotation::DBLink->new();
+	   $dblinkobj->database($1);
+	   $dblinkobj->primary_id($2);
+	   $dblinkobj->optional_id($3);
+	   my $comment = $4;
+	   if(length($comment) > 0) {
+	       # edit comment to get rid of leading space and trailing dot
+	       if( $comment =~ /^\s*(\S+)\./ ) {
+		   $dblinkobj->comment($1);
+	       } else {
+		   $dblinkobj->comment($comment);
+	       }
+	   }
+	   $annotation->add_Annotation('dblink',$dblinkobj);
+}
+#keywords
+elsif( /^KW\s+(.*)$/ ) {
+	   my @kw = split(/\s*\;\s*/,$1);
+	   defined $kw[-1] && $kw[-1] =~ s/\.$//;
+	   push @{$params{'-keywords'}}, @kw;
+}
+# Get next line. Getting here assumes that we indeed need to read the
+# line.
+$buffer = $self->_readline;
+}
+$buffer = $_;
+FEATURE_TABLE :
+# if there is no feature table, or if we've got beyond, exit loop or don't
+# even enter    HL 05/11/2000
+while (defined ($buffer) && ($buffer =~ /^FT/)) {
+my $ftunit = $self->_read_FTHelper_swissprot(\$buffer);
+# process ftunit
+# when parsing of the line fails we get undef returned
+if($ftunit) {
+	   push(@features,
+		$ftunit->_generic_seqfeature($self->location_factory(),
+					     $params{'-seqid'}, "SwissProt"));
+} else {
+	   $self->warn("failed to parse feature table line for seq " .
+		       $params{'-display_id'});
+}
+}
+if( $buffer !~ /^SQ/  ) {
+while( defined($_ = $self->_readline) ) {
+	   /^SQ/ && last;
+}
+}
+$seqc = "";
+while( defined ($_ = $self->_readline) ) {
+/^\/\// && last;
+$_ = uc($_);
+s/[^A-Za-z]//g;
+$seqc .= $_;
+}
+my $seq=  $self->sequence_factory->create
+(-verbose  => $self->verbose,
+	%params,
+	-seq      => $seqc,
+	-desc     => $desc,
+	-features => \@features,
+	-annotation => $annotation,
+	);
+# The annotation doesn't get added by the contructor
+$seq->annotation($annotation);
+return $seq;
+}
+=head2 write_seq
+Title   : write_seq
+Usage   : $stream->write_seq($seq)
+Function: writes the $seq object (must be seq) to the stream
+Returns : 1 for success and 0 for error
+Args    : array of 1 to n Bio::SeqI objects
+=cut
+sub write_seq {
+my ($self,@seqs) = @_;
+foreach my $seq ( @seqs ) {
+	$self->throw("Attempting to write with no seq!") unless defined $seq;
+	if( ! ref $seq || ! $seq->isa('Bio::SeqI') ) {
+	    $self->warn(" $seq is not a SeqI compliant module. Attempting to dump, but may fail!");
+	}
+	my $i;
+	my $str = $seq->seq;
+	my $mol;
+	my $div;
+	my $len = $seq->length();
+	if ( !$seq->can('division') || ! defined ($div = $seq->division()) ) {
+	    $div = 'UNK';
+	}
+	if( ! $seq->can('alphabet') || ! defined ($mol = $seq->alphabet) ) {
+	    $mol = 'XXX';
+	}
+	my $temp_line;
+	if( $self->_id_generation_func ) {
+	    $temp_line = &{$self->_id_generation_func}($seq);
+	} else {
+	    #$temp_line = sprintf ("%10s     STANDARD;      %3s;   %d AA.",
+	    #		     $seq->primary_id()."_".$div,$mol,$len);
+	    # Reconstructing the ID relies heavily upon the input source having
+	    # been in a format that is parsed as this routine expects it -- that is,
+	    # by this module itself. This is bad, I think, and immediately breaks
+	    # if e.g. the Bio::DB::GenPept module is used as input.
+	    # Hence, switch to display_id(); _every_ sequence is supposed to have
+	    # this. HL 2000/09/03
+	    $mol =~ s/protein/PRT/;
+	    $temp_line = sprintf ("%10s     STANDARD;      %3s;   %d AA.",
+				  $seq->display_id(), $mol, $len);
+	}
+	$self->_print( "ID   $temp_line\n");
+	# if there, write the accession line
+	local($^W) = 0;	# supressing warnings about uninitialized fields
+	if( $self->_ac_generation_func ) {
+	    $temp_line = &{$self->_ac_generation_func}($seq);
+	    $self->_print( "AC   $temp_line\n");
+	} else {
+	    if ($seq->can('accession_number') ) {
+		$self->_print("AC   ",$seq->accession_number,";");
+		if ($seq->can('get_secondary_accessions') ) {
+		    foreach my $sacc ($seq->get_secondary_accessions) {
+			$self->_print(" ",$sacc,";");
+		    }
+		    $self->_print("\n");
+		}
+		else {
+		    $self->_print("\n");
+		}
+	    }
+	    # otherwise - cannot print <sigh>
+	}
+	# Date lines
+	if( $seq->can('get_dates') ) {
+	    foreach my $dt ( $seq->get_dates() ) {
+		$self->_write_line_swissprot_regex("DT   ","DT   ",
+						   $dt,"\\s\+\|\$",80);
+	    }
+	}
+	#Definition lines
+	$self->_write_line_swissprot_regex("DE   ","DE   ",$seq->desc(),"\\s\+\|\$",80);
+	#Gene name
+	if ((my @genes = $seq->annotation->get_Annotations('gene_name') ) ) {
+	    $self->_print("GN   ",
+			  join(' OR ',
+			       map {
+				   $_->isa("Bio::Annotation::StructuredValue") ?
+				       $_->value(-joins => [" AND ", " OR "]) :
+				       $_->value();
+			       } @genes),
+			  ".\n");
+	}
+	# Organism lines
+	if ($seq->can('species') && (my $spec = $seq->species)) {
+	    my($species, @class) = $spec->classification();
+	    my $genus = $class[0];
+	    my $OS = "$genus $species";
+	    if ($class[$#class] =~ /viruses/i) {
+		# different OS / OC syntax for viruses LP 09/16/2000
+		shift @class;
+	    }
+	    if (my $ssp = $spec->sub_species) {
+		$OS .= " $ssp";
+	    }
+	    foreach (($spec->variant, $spec->common_name)) {
+		$OS .= " ($_)" if $_;
+	    }
+	    $self->_print( "OS   $OS.\n");
+	    my $OC = join('; ', reverse(@class)) .'.';
+	    $self->_write_line_swissprot_regex("OC   ","OC   ",$OC,"\; \|\$",80);
+	    if ($spec->organelle) {
+		$self->_write_line_swissprot_regex("OG   ","OG   ",$spec->organelle,"\; \|\$",80);
+	    }
+	    if ($spec->ncbi_taxid) {
+		$self->_print("OX   NCBI_TaxID=".$spec->ncbi_taxid.";\n");
+	    }
+	}
+	# Reference lines
+	my $t = 1;
+	foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
+	    $self->_print( "RN   [$t]\n");
+	    # changed by lorenz 08/03/00
+	    # j.gilbert and h.lapp agreed that the rp line in swissprot seems
+	    # more like a comment than a parseable value, so print it as is
+	    if ($ref->rp) {
+		$self->_write_line_swissprot_regex("RP   ","RP   ",$ref->rp,
+						   "\\s\+\|\$",80);
+	    }
+	    if ($ref->comment) {
+		$self->_write_line_swissprot_regex("RC   ","RC   ",$ref->comment,
+						   "\\s\+\|\$",80);
+	    }
+	    if ($ref->medline) {
+		# new RX format in swissprot LP 09/17/00
+		if ($ref->pubmed) {
+		    $self->_write_line_swissprot_regex("RX   ","RX   ",
+						       "MEDLINE=".$ref->medline.
+						       "; PubMed=".$ref->pubmed.";",
+						       "\\s\+\|\$",80);
+		} else {
+		    $self->_write_line_swissprot_regex("RX   MEDLINE; ","RX   MEDLINE; ",
+						       $ref->medline.".","\\s\+\|\$",80);
+		}
+	    }
+	    my $author = $ref->authors .';' if($ref->authors);
+	    my $title = $ref->title .';' if( $ref->title);
+	    $self->_write_line_swissprot_regex("RA   ","RA   ",$author,"\\s\+\|\$",80);
+	    $self->_write_line_swissprot_regex("RT   ","RT   ",$title,"\\s\+\|\$",80);
+	    $self->_write_line_swissprot_regex("RL   ","RL   ",$ref->location,"\\s\+\|\$",80);
+	    $t++;
+	}
+	# Comment lines
+	foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
+	    foreach my $cline (split ("\n", $comment->text)) {
+		while (length $cline > 74) {
+		    $self->_print("CC   ",(substr $cline,0,74),"\n");
+		    $cline = substr $cline,74;
+		}
+		$self->_print("CC   ",$cline,"\n");
+	    }
+	}
+	foreach my $dblink ( $seq->annotation->get_Annotations('dblink') )
+	{
+	    if (defined($dblink->comment)&&($dblink->comment)) {
+		$self->_print("DR   ",$dblink->database,"; ",$dblink->primary_id,"; ",
+			      $dblink->optional_id,"; ",$dblink->comment,".\n");
+	    } elsif($dblink->optional_id) {
+		$self->_print("DR   ",$dblink->database,"; ",
+			      $dblink->primary_id,"; ",
+			      $dblink->optional_id,".\n");
+	    }
+	    else {
+		$self->_print("DR   ",$dblink->database,
+			      "; ",$dblink->primary_id,"; ","-.\n");
+	    }
+	}
+	# if there, write the kw line
+	{
+	    my( $kw );
+	    if( my $func = $self->_kw_generation_func ) {
+		$kw = &{$func}($seq);
+	    } elsif( $seq->can('keywords') ) {
+		$kw = $seq->keywords;
+		if( ref($kw) =~ /ARRAY/i ) {
+		    $kw = join("; ", @$kw);
+		}
+		$kw .= '.' if( $kw !~ /\.$/ );
+	    }
+	    $self->_write_line_swissprot_regex("KW   ","KW   ",
+					       $kw, "\\s\+\|\$",80);
+	}
+#Check if there are seqfeatures before printing the FT line
+my @feats = $seq->can('top_SeqFeatures') ? $seq->top_SeqFeatures : ();
+	if ($feats[0]) {
+	    if( defined $self->_post_sort ) {
+		# we need to read things into an array. Process. Sort them. Print 'em
+		my $post_sort_func = $self->_post_sort();
+		my @fth;
+		foreach my $sf ( @feats ) {
+		    push(@fth,Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq));
+		}
+		@fth = sort { &$post_sort_func($a,$b) } @fth;
+		foreach my $fth ( @fth ) {
+		    $self->_print_swissprot_FTHelper($fth);
+		}
+	    } else {
+		# not post sorted. And so we can print as we get them.
+		# lower memory load...
+		foreach my $sf ( @feats ) {
+		    my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq);
+		    foreach my $fth ( @fth ) {
+			if( ! $fth->isa('Bio::SeqIO::FTHelper') ) {
+			    $sf->throw("Cannot process FTHelper... $fth");
+			}
+			$self->_print_swissprot_FTHelper($fth);
+		    }
+		}
+	    }
+	    if( $self->_show_dna() == 0 ) {
+		return;
+	    }
+	}
+	# finished printing features.
+	# molecular weight
+	my $mw = ${Bio::Tools::SeqStats->get_mol_wt($seq->primary_seq)}[0];
+	# checksum
+	# was crc32 checksum, changed it to crc64
+	my $crc64 = $self->_crc64(\$str);
+	$self->_print( sprintf("SQ   SEQUENCE  %4d AA;  %d MW;  %16s CRC64;\n",
+			       $len,$mw,$crc64));
+	$self->_print( "     ");
+	my $linepos;
+	for ($i = 0; $i < length($str); $i += 10) {
+	    $self->_print( substr($str,$i,10), " ");
+	    $linepos += 11;
+	    if( ($i+10)%60 == 0 && (($i+10) < length($str))) {
+		$self->_print( "\n     ");
+	    }
+	}
+	$self->_print( "\n//\n");
+	$self->flush if $self->_flush_on_write && defined $self->_fh;
+	return 1;
+}
+}
+# Thanks to James Gilbert for the following two. LP 08/01/2000
+=head2 _generateCRCTable
+Title   : _generateCRCTable
+Usage   :
+Function:
+Example :
+Returns :
+Args    :
+=cut
+sub _generateCRCTable {
+# 10001000001010010010001110000100
+# 32
+my $poly = 0xEDB88320;
+my ($self) = shift;
+$self->{'_crcTable'} = [];
+foreach my $i (0..255) {
+my $crc = $i;
+for (my $j=8; $j > 0; $j--) {
+if ($crc & 1) {
+	$crc = ($crc >> 1) ^ $poly;
+}
+else {
+	$crc >>= 1;
+}
+}
+${$self->{'_crcTable'}}[$i] = $crc;
+}
+}
+=head2 _crc32
+Title   : _crc32
+Usage   :
+Function:
+Example :
+Returns :
+Args    :
+=cut
+sub _crc32 {
+my( $self, $str ) = @_;
+$self->throw("Argument to crc32() must be ref to scalar")
+unless ref($str) eq 'SCALAR';
+$self->_generateCRCTable() unless exists $self->{'_crcTable'};
+my $len = length($$str);
+my $crc = 0xFFFFFFFF;
+for (my $i = 0; $i < $len; $i++) {
+# Get upper case value of each letter
+my $int = ord uc substr $$str, $i, 1;
+$crc = (($crc >> 8) & 0x00FFFFFF) ^
+${$self->{'_crcTable'}}[ ($crc ^ $int) & 0xFF ];
+}
+return $crc;
+}
+=head2 _crc64
+Title   : _crc64
+Usage   :
+Function:
+Example :
+Returns :
+Args    :
+=cut
+sub _crc64{
+my ($self, $sequence) = @_;
+my $POLY64REVh = 0xd8000000;
+my @CRCTableh = 256;
+my @CRCTablel = 256;
+my $initialized;
+my $seq = $$sequence;
+my $crcl = 0;
+my $crch = 0;
+if (!$initialized) {
+	$initialized = 1;
+	for (my $i=0; $i<256; $i++) {
+	    my $partl = $i;
+	    my $parth = 0;
+	    for (my $j=0; $j<8; $j++) {
+		my $rflag = $partl & 1;
+		$partl >>= 1;
+		$partl |= (1 << 31) if $parth & 1;
+		$parth >>= 1;
+		$parth ^= $POLY64REVh if $rflag;
+	    }
+	    $CRCTableh[$i] = $parth;
+	    $CRCTablel[$i] = $partl;
+	}
+}
+foreach (split '', $seq) {
+	my $shr = ($crch & 0xFF) << 24;
+	my $temp1h = $crch >> 8;
+	my $temp1l = ($crcl >> 8) | $shr;
+	my $tableindex = ($crcl ^ (unpack "C", $_)) & 0xFF;
+	$crch = $temp1h ^ $CRCTableh[$tableindex];
+	$crcl = $temp1l ^ $CRCTablel[$tableindex];
+}
+my $crc64 = sprintf("%08X%08X", $crch, $crcl);
+return $crc64;
+}
+=head2 _print_swissprot_FTHelper
+Title   : _print_swissprot_FTHelper
+Usage   :
+Function:
+Example :
+Returns :
+Args    :
+=cut
+sub _print_swissprot_FTHelper {
+my ($self,$fth,$always_quote) = @_;
+$always_quote ||= 0;
+my ($start,$end) = ('?', '?');
+if( ! ref $fth || ! $fth->isa('Bio::SeqIO::FTHelper') ) {
+$fth->warn("$fth is not a FTHelper class. ".
+		  "Attempting to print, but there could be tears!");
+}
+if( $fth->loc =~ /(\?|\d+|\>\d+|<\d+)?\.\.(\?|\d+|<\d+|>\d+)?/ ) {
+$start = $1 if defined $1;
+$end = $2 if defined $2;
+# to_FTString only returns one value when start == end, #JB955
+# so if no match is found, assume it is both start and end #JB955
+} else {
+$start = $end = $fth->loc;
+}
+my $desc = "";
+$desc = @{$fth->field->{"description"}}[0]."."
+if exists $fth->field->{"description"};
+$self->_write_line_swissprot_regex(sprintf("FT   %-8s %6s %6s       ",
+					      substr($fth->key,0,8),
+					      $start,$end),
+				      "FT                                ",
+				      $desc.'.','\s+|$',80);
+}
+#'
+=head2 _read_swissprot_References
+Title   : _read_swissprot_References
+Usage   :
+Function: Reads references from swissprot format. Internal function really
+Example :
+Returns :
+Args    :
+=cut
+sub _read_swissprot_References{
+my ($self,$buffer) = @_;
+my (@refs);
+my ($b1, $b2, $rp, $title, $loc, $au, $med, $com, $pubmed);
+if ($$buffer !~ /^RP/) {
+$$buffer = $self->_readline;
+}
+if( !defined $$buffer ) { return undef; }
+if( $$buffer =~ /^RP/ ) {
+if ($$buffer =~ /^RP   (SEQUENCE OF (\d+)-(\d+).*)/) {
+	   $rp=$1;
+	   $b1=$2;
+	   $b2=$3;
+}
+elsif ($$buffer =~ /^RP   (.*)/) {
+	   $rp=$1;
+}
+}
+while( defined ($_ = $self->_readline) ) {
+#/^CC/ && last;
+/^RN/ && last; # separator between references ! LP 07/25/2000
+#/^SQ/ && last; # there may be sequences without CC lines! HL 05/11/2000
+/^[^R]/ && last; # may be the safest exit point HL 05/11/2000
+/^RX   MEDLINE;\s+(\d+)/ && do {$med=$1};
+/^RX   MEDLINE=(\d+);\s+PubMed=(\d+);/ && do {$med=$1;$pubmed=$2};
+/^RA   (.*)/ && do { $au .= $au ? " $1" : $1;   next;};
+/^RT   (.*)/ && do { $title .= $title ? " $1" : $1; next;};
+/^RL   (.*)/ && do { $loc .= $loc ? " $1" : $1; next;};
+/^RC   (.*)/ && do { $com .= $com ? " $1" : $1; next;};
+}
+my $ref = new Bio::Annotation::Reference;
+$au =~ s/;\s*$//g;
+if( defined $title ) {
+$title =~ s/;\s*$//g;
+}
+$ref->start($b1);
+$ref->end($b2);
+$ref->authors($au);
+$ref->title($title);
+$ref->location($loc);
+$ref->medline($med);
+$ref->pubmed($pubmed) if (defined $pubmed);
+$ref->comment($com);
+$ref->rp($rp);
+push(@refs,$ref);
+$$buffer = $_;
+return \@refs;
+}
+=head2 _read_swissprot_Species
+Title   : _read_swissprot_Species
+Usage   :
+Function: Reads the swissprot Organism species and classification
+lines.
+Example :
+Returns : A Bio::Species object
+Args    :
+=cut
+sub _read_swissprot_Species {
+my( $self, $buffer ) = @_;
+my $org;
+$_ = $$buffer;
+my( $subspecies, $species, $genus, $common, $variant, $ncbi_taxid );
+my @class;
+my ($binomial, $descr);
+my $osline = "";
+while (defined( $_ ||= $self->_readline )) {
+	last unless /^O[SCGX]/;
+	# believe it or not, but OS may come multiple times -- at this time
+	# we can't capture multiple species
+	if(/^OS\s+(\S.+)/ && (! defined($binomial))) {
+	    $osline .= " " if $osline;
+	    $osline .= $1;
+	    if($osline =~ s/(,|, and|\.)$//) {
+		($binomial, $descr) = $osline =~ /(\S[^\(]+)(.*)/;
+		($genus, $species, $subspecies) = split(/\s+/, $binomial);
+		$species = "sp." unless $species;
+		while($descr =~ /\(([^\)]+)\)/g) {
+		    my $item = $1;
+		    # strain etc may not necessarily come first (yes, swissprot
+		    # is messy)
+		    if((! defined($variant)) &&
+		       (($item =~ /(^|[^\(\w])([Ss]train|isolate|serogroup|serotype|subtype|clone)\b/) ||
+			($item =~ /^(biovar|pv\.|type\s+)/))) {
+			$variant = $item;
+		    } elsif($item =~ s/^subsp\.\s+//) {
+			if(! $subspecies) {
+			    $subspecies = $item;
+			} elsif(! $variant) {
+			    $variant = $item;
+			}
+		    } elsif(! defined($common)) {
+			# we're only interested in the first common name
+			$common = $item;
+			if((index($common, '(') >= 0) &&
+			   (index($common, ')') < 0)) {
+			    $common .= ')';
+			}
+		    }
+		}
+	    }
+}
+elsif (s/^OC\s+//) {
+push(@class, split /[\;\.]\s*/);
+	    if($class[0] =~ /viruses/i) {
+		# viruses have different OS/OC syntax
+		my @virusnames = split(/\s+/, $binomial);
+		$species = (@virusnames > 1) ? pop(@virusnames) : '';
+		$genus = join(" ", @virusnames);
+		$subspecies = undef;
+	    }
+}
+	elsif (/^OG\s+(.*)/) {
+	    $org = $1;
+	}
+	elsif (/^OX\s+(.*)/ && (! defined($ncbi_taxid))) {
+my $taxstring = $1;
+	    # we only keep the first one and ignore all others
+if ($taxstring =~ /NCBI_TaxID=([\w\d]+)/) {
+$ncbi_taxid = $1;
+} else {
+$self->throw("$taxstring doesn't look like NCBI_TaxID");
+}
+	}
+$_ = undef; # Empty $_ to trigger read of next line
+}
+$$buffer = $_;
+# Don't make a species object if it is "Unknown" or "None"
+return if $genus =~ /^(Unknown|None)$/i;
+if ($class[$#class] eq $genus) {
+push( @class, $species );
+} else {
+push( @class, $genus, $species );
+}
+@class = reverse @class;
+my $taxon = Bio::Species->new();
+$taxon->classification( \@class, "FORCE" ); # no name validation please
+$taxon->common_name( $common      ) if $common;
+$taxon->sub_species( $subspecies ) if $subspecies;
+$taxon->organelle  ( $org         ) if $org;
+$taxon->ncbi_taxid ( $ncbi_taxid  ) if $ncbi_taxid;
+$taxon->variant($variant)           if $variant;
+# done
+return $taxon;
+}
+=head2 _filehandle
+Title   : _filehandle
+Usage   : $obj->_filehandle($newval)
+Function:
+Example :
+Returns : value of _filehandle
+Args    : newvalue (optional)
+=cut
+# inherited from SeqIO.pm ! HL 05/11/2000
+=head2 _read_FTHelper_swissprot
+Title   : _read_FTHelper_swissprot
+Usage   : _read_FTHelper_swissprot(\$buffer)
+Function: reads the next FT key line
+Example :
+Returns : Bio::SeqIO::FTHelper object
+Args    : filehandle and reference to a scalar
+=cut
+sub _read_FTHelper_swissprot {
+# initial version implemented by HL 05/10/2000
+# FIXME this may not be perfect, so please review
+my ($self,$buffer) = @_;
+my ($key,   # The key of the feature
+$loc,   # The location line from the feature
+$desc,  # The descriptive text
+);
+if ($$buffer =~ /^FT   (\w+)\s+([\d\?\<]+)\s+([\d\?\>]+)\s*(.*)$/) {
+$key = $1;
+my $loc1 = $2;
+my $loc2 = $3;
+	$loc = "$loc1..$loc2";
+	if($4 && (length($4) > 0)) {
+	    $desc = $4;
+	    chomp($desc);
+	} else {
+	    $desc = "";
+	}
+	# Read all the continuation lines up to the next feature
+	while (defined($_ = $self->_readline) && /^FT\s{20,}(\S.*)$/) {
+	    $desc .= $1;
+	    chomp($desc);
+	}
+	$desc =~ s/\.$//;
+} else {
+# No feature key. What's this?
+	$self->warn("No feature key in putative feature table line: $_");
+return;
+}
+# Put the first line of the next feature into the buffer
+$$buffer = $_;
+# Make the new FTHelper object
+my $out = new Bio::SeqIO::FTHelper(-verbose => $self->verbose());
+$out->key($key);
+$out->loc($loc);
+# store the description if there is one
+if($desc && (length($desc) > 0)) {
+	$out->field->{"description"} ||= [];
+	push(@{$out->field->{"description"}}, $desc);
+}
+return $out;
+}
+=head2 _write_line_swissprot
+Title   : _write_line_swissprot
+Usage   :
+Function: internal function
+Example :
+Returns :
+Args    :
+=cut
+sub _write_line_swissprot{
+my ($self,$pre1,$pre2,$line,$length) = @_;
+$length || die "Miscalled write_line_swissprot without length. Programming error!";
+my $subl = $length - length $pre2;
+my $linel = length $line;
+my $i;
+my $sub = substr($line,0,$length - length $pre1);
+$self->_print( "$pre1$sub\n");
+for($i= ($length - length $pre1);$i < $linel;) {
+$sub = substr($line,$i,($subl));
+$self->_print( "$pre2$sub\n");
+$i += $subl;
+}
+}
+=head2 _write_line_swissprot_regex
+Title   : _write_line_swissprot_regex
+Usage   :
+Function: internal function for writing lines of specified
+length, with different first and the next line
+left hand headers and split at specific points in the
+text
+Example :
+Returns : nothing
+Args    : file handle, first header, second header, text-line, regex for line breaks, total line length
+=cut
+sub _write_line_swissprot_regex {
+my ($self,$pre1,$pre2,$line,$regex,$length) = @_;
+#print STDOUT "Going to print with $line!\n";
+$length || die "Miscalled write_line_swissprot without length. Programming error!";
+if( length $pre1 != length $pre2 ) {
+print STDERR "len 1 is ", length $pre1, " len 2 is ", length $pre2, "\n";
+die "Programming error - cannot called write_line_swissprot_regex with different length \npre1 ($pre1) and \npre2 ($pre2) tags!";
+}
+my $subl = $length - (length $pre1) -1 ;
+my @lines;
+while($line =~ m/(.{1,$subl})($regex)/g) {
+push(@lines, $1.$2);
+}
+my $s = shift @lines;
+$self->_print( "$pre1$s\n");
+foreach my $s ( @lines ) {
+$self->_print( "$pre2$s\n");
+}
+}
+=head2 _post_sort
+Title   : _post_sort
+Usage   : $obj->_post_sort($newval)
+Function:
+Returns : value of _post_sort
+Args    : newvalue (optional)
+=cut
+sub _post_sort{
+my $obj = shift;
+if( @_ ) {
+my $value = shift;
+$obj->{'_post_sort'} = $value;
+}
+return $obj->{'_post_sort'};
+}
+=head2 _show_dna
+Title   : _show_dna
+Usage   : $obj->_show_dna($newval)
+Function:
+Returns : value of _show_dna
+Args    : newvalue (optional)
+=cut
+sub _show_dna{
+my $obj = shift;
+if( @_ ) {
+my $value = shift;
+$obj->{'_show_dna'} = $value;
+}
+return $obj->{'_show_dna'};
+}
+=head2 _id_generation_func
+Title   : _id_generation_func
+Usage   : $obj->_id_generation_func($newval)
+Function:
+Returns : value of _id_generation_func
+Args    : newvalue (optional)
+=cut
+sub _id_generation_func{
+my $obj = shift;
+if( @_ ) {
+my $value = shift;
+$obj->{'_id_generation_func'} = $value;
+}
+return $obj->{'_id_generation_func'};
+}
+=head2 _ac_generation_func
+Title   : _ac_generation_func
+Usage   : $obj->_ac_generation_func($newval)
+Function:
+Returns : value of _ac_generation_func
+Args    : newvalue (optional)
+=cut
+sub _ac_generation_func{
+my $obj = shift;
+if( @_ ) {
+my $value = shift;
+$obj->{'_ac_generation_func'} = $value;
+}
+return $obj->{'_ac_generation_func'};
+}
+=head2 _sv_generation_func
+Title   : _sv_generation_func
+Usage   : $obj->_sv_generation_func($newval)
+Function:
+Returns : value of _sv_generation_func
+Args    : newvalue (optional)
+=cut
+sub _sv_generation_func{
+my $obj = shift;
+if( @_ ) {
+my $value = shift;
+$obj->{'_sv_generation_func'} = $value;
+}
+return $obj->{'_sv_generation_func'};
+}
+=head2 _kw_generation_func
+Title   : _kw_generation_func
+Usage   : $obj->_kw_generation_func($newval)
+Function:
+Returns : value of _kw_generation_func
+Args    : newvalue (optional)
+=cut
+sub _kw_generation_func{
+my $obj = shift;
+if( @_ ) {
+my $value = shift;
+$obj->{'_kw_generation_func'} = $value;
+}
+return $obj->{'_kw_generation_func'};
+}
+1;

Mercurial > repos > mahtabm > ensemb_rep_gvl

comparison variant_effect_predictor/Bio/SeqIO/swiss.pm @ 0:2bc9b66ada89 draft default tip