Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/SeqIO/swiss.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/swiss.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1251 @@ +# $Id: swiss.pm,v 1.66.2.4 2003/09/13 22:16:43 jason Exp $ +# +# BioPerl module for Bio::SeqIO::swiss +# +# Cared for by Elia Stupka <elia@tll.org.sg> +# +# Copyright Elia Stupka +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::swiss - Swissprot sequence input/output stream + +=head1 SYNOPSIS + +It is probably best not to use this object directly, but +rather go through the SeqIO handler system. Go: + + $stream = Bio::SeqIO->new(-file => $filename, -format => 'swiss'); + + while ( my $seq = $stream->next_seq() ) { + # do something with $seq + } + +=head1 DESCRIPTION + +This object can transform Bio::Seq objects to and from swissprot flat +file databases. + +There is a lot of flexibility here about how to dump things which I need +to document fully. + + +=head2 Optional functions + +=over 3 + +=item _show_dna() + +(output only) shows the dna or not + +=item _post_sort() + +(output only) provides a sorting func which is applied to the FTHelpers +before printing + +=item _id_generation_func() + +This is function which is called as + + print "ID ", $func($seq), "\n"; + +To generate the ID line. If it is not there, it generates a sensible ID +line using a number of tools. + +If you want to output annotations in swissprot format they need to be +stored in a Bio::Annotation::Collection object which is accessible +through the Bio::SeqI interface method L<annotation()|annotation>. + +The following are the names of the keys which are polled from a +L<Bio::Annotation::Collection> object. + +reference - Should contain Bio::Annotation::Reference objects +comment - Should contain Bio::Annotation::Comment objects +dblink - Should contain Bio::Annotation::DBLink objects +gene_name - Should contain Bio::Annotation::SimpleValue object + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Elia Stupka + +Email elia@tll.org.sg + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SeqIO::swiss; +use vars qw(@ISA); +use strict; +use Bio::SeqIO; +use Bio::SeqIO::FTHelper; +use Bio::SeqFeature::Generic; +use Bio::Species; +use Bio::Tools::SeqStats; +use Bio::Seq::SeqFactory; +use Bio::Annotation::Collection; +use Bio::Annotation::Comment; +use Bio::Annotation::Reference; +use Bio::Annotation::DBLink; +use Bio::Annotation::SimpleValue; +use Bio::Annotation::StructuredValue; + +@ISA = qw(Bio::SeqIO); + + +sub _initialize { + my($self,@args) = @_; + $self->SUPER::_initialize(@args); + + # hash for functions for decoding keys. + $self->{'_func_ftunit_hash'} = {}; + $self->_show_dna(1); # sets this to one by default. People can change it + if( ! defined $self->sequence_factory ) { + $self->sequence_factory(new Bio::Seq::SeqFactory + (-verbose => $self->verbose(), + -type => 'Bio::Seq::RichSeq')); + } +} + +=head2 next_seq + + Title : next_seq + Usage : $seq = $stream->next_seq() + Function: returns the next sequence in the stream + Returns : Bio::Seq object + Args : + + +=cut + +sub next_seq { + my ($self,@args) = @_; + my ($pseq,$c,$line,$name,$desc,$acc,$seqc,$mol,$div, + $date,$comment,@date_arr); + + my $genename = ""; + my ($annotation, %params, @features) = ( new Bio::Annotation::Collection); + + $line = $self->_readline; + + if( !defined $line) { + return undef; # no throws - end of file + } + + if( $line =~ /^\s+$/ ) { + while( defined ($line = $self->_readline) ) { + $line =~ /\S/ && last; + } + } + if( !defined $line ) { + return undef; # end of file + } + + # fixed to allow _DIVISION to be optional for bug #946 + # see bug report for more information + $line =~ /^ID\s+([^\s_]+)(_([^\s_]+))?\s+([^\s;]+);\s+([^\s;]+);/ + || $self->throw("swissprot stream with no ID. Not swissprot in my book"); + + if( $3 ) { + $name = "$1$2"; + $params{'-division'} = $3; + } else { + $name = $1; + $params{'-division'} = 'UNK'; + $params{'-primary_id'} = $1; + } + $params{'-alphabet'} = 'protein'; + # this is important to have the id for display in e.g. FTHelper, otherwise + # you won't know which entry caused an error + $params{'-display_id'} = $name; + + my $buffer = $line; + + BEFORE_FEATURE_TABLE : + until( !defined ($buffer) ) { + $_ = $buffer; + + # Exit at start of Feature table + last if /^FT/; + # and at the sequence at the latest HL 05/11/2000 + last if /^SQ/; + + # Description line(s) + if (/^DE\s+(\S.*\S)/) { + $desc .= $desc ? " $1" : $1; + } + #Gene name + elsif(/^GN\s+(.*)/) { + $genename .= " " if $genename; + $genename .= $1; + # has GN terminated yet? + if($genename =~ s/[\. ]+$//) { + my $gn = Bio::Annotation::StructuredValue->new(); + foreach my $gene (split(/ AND /, $genename)) { + $gene =~ s/^\(//; + $gene =~ s/\)$//; + $gn->add_value([-1,-1], split(/ OR /, $gene)); + } + $annotation->add_Annotation('gene_name',$gn, + "Bio::Annotation::SimpleValue"); + } + } + #accession number(s) + elsif( /^AC\s+(.+)/) { + my @accs = split(/[; ]+/, $1); # allow space in addition + $params{'-accession_number'} = shift @accs + unless defined $params{'-accession_number'}; + push @{$params{'-secondary_accessions'}}, @accs; + } + #version number + elsif( /^SV\s+(\S+);?/ ) { + my $sv = $1; + $sv =~ s/\;//; + $params{'-seq_version'} = $sv; + } + #date + elsif( /^DT\s+(.*)/ ) { + my $date = $1; + $date =~ s/\;//; + $date =~ s/\s+$//; + push @{$params{'-dates'}}, $date; + } + # Organism name and phylogenetic information + elsif (/^O[SCG]/) { + my $species = $self->_read_swissprot_Species(\$buffer); + $params{'-species'}= $species; + # now we are one line ahead -- so continue without reading the next + # line HL 05/11/2000 + next; + } + # References + elsif (/^R/) { + my $refs = $self->_read_swissprot_References(\$buffer); + + foreach my $r (@$refs) { + $annotation->add_Annotation('reference',$r); + } + # now we are one line ahead -- so continue without reading the next + # line HL 05/11/2000 + next; + } + #Comments + elsif (/^CC\s{3}(.*)/) { + $comment .= $1; + $comment .= "\n"; + while (defined ($buffer = $self->_readline)) { + if ($buffer =~ /^CC\s{3}(.*)/) { + $comment .= $1; + $comment .= "\n"; + } + else { + last; + } + } + my $commobj = Bio::Annotation::Comment->new(); + # note: don't try to process comments here -- they may contain + # structure. LP 07/30/2000 + $commobj->text($comment); + $annotation->add_Annotation('comment',$commobj); + $comment = ""; + # now we are one line ahead -- so continue without reading the next + # line HL 05/11/2000 + next; + } + #DBLinks + elsif (/^DR\s+(\S+)\;\s+(\S+)\;\s+(\S+)[\;\.](.*)$/) { + my $dblinkobj = Bio::Annotation::DBLink->new(); + $dblinkobj->database($1); + $dblinkobj->primary_id($2); + $dblinkobj->optional_id($3); + my $comment = $4; + if(length($comment) > 0) { + # edit comment to get rid of leading space and trailing dot + if( $comment =~ /^\s*(\S+)\./ ) { + $dblinkobj->comment($1); + } else { + $dblinkobj->comment($comment); + } + } + $annotation->add_Annotation('dblink',$dblinkobj); + } + #keywords + elsif( /^KW\s+(.*)$/ ) { + my @kw = split(/\s*\;\s*/,$1); + defined $kw[-1] && $kw[-1] =~ s/\.$//; + push @{$params{'-keywords'}}, @kw; + } + + + # Get next line. Getting here assumes that we indeed need to read the + # line. + $buffer = $self->_readline; + } + + $buffer = $_; + + FEATURE_TABLE : + # if there is no feature table, or if we've got beyond, exit loop or don't + # even enter HL 05/11/2000 + while (defined ($buffer) && ($buffer =~ /^FT/)) { + my $ftunit = $self->_read_FTHelper_swissprot(\$buffer); + + # process ftunit + # when parsing of the line fails we get undef returned + if($ftunit) { + push(@features, + $ftunit->_generic_seqfeature($self->location_factory(), + $params{'-seqid'}, "SwissProt")); + } else { + $self->warn("failed to parse feature table line for seq " . + $params{'-display_id'}); + } + } + if( $buffer !~ /^SQ/ ) { + while( defined($_ = $self->_readline) ) { + /^SQ/ && last; + } + } + $seqc = ""; + while( defined ($_ = $self->_readline) ) { + /^\/\// && last; + $_ = uc($_); + s/[^A-Za-z]//g; + $seqc .= $_; + } + + my $seq= $self->sequence_factory->create + (-verbose => $self->verbose, + %params, + -seq => $seqc, + -desc => $desc, + -features => \@features, + -annotation => $annotation, + ); + + # The annotation doesn't get added by the contructor + $seq->annotation($annotation); + + return $seq; +} + +=head2 write_seq + + Title : write_seq + Usage : $stream->write_seq($seq) + Function: writes the $seq object (must be seq) to the stream + Returns : 1 for success and 0 for error + Args : array of 1 to n Bio::SeqI objects + + +=cut + +sub write_seq { + my ($self,@seqs) = @_; + foreach my $seq ( @seqs ) { + $self->throw("Attempting to write with no seq!") unless defined $seq; + + if( ! ref $seq || ! $seq->isa('Bio::SeqI') ) { + $self->warn(" $seq is not a SeqI compliant module. Attempting to dump, but may fail!"); + } + + my $i; + my $str = $seq->seq; + + my $mol; + my $div; + my $len = $seq->length(); + + if ( !$seq->can('division') || ! defined ($div = $seq->division()) ) { + $div = 'UNK'; + } + + if( ! $seq->can('alphabet') || ! defined ($mol = $seq->alphabet) ) { + $mol = 'XXX'; + } + + my $temp_line; + if( $self->_id_generation_func ) { + $temp_line = &{$self->_id_generation_func}($seq); + } else { + #$temp_line = sprintf ("%10s STANDARD; %3s; %d AA.", + # $seq->primary_id()."_".$div,$mol,$len); + # Reconstructing the ID relies heavily upon the input source having + # been in a format that is parsed as this routine expects it -- that is, + # by this module itself. This is bad, I think, and immediately breaks + # if e.g. the Bio::DB::GenPept module is used as input. + # Hence, switch to display_id(); _every_ sequence is supposed to have + # this. HL 2000/09/03 + $mol =~ s/protein/PRT/; + $temp_line = sprintf ("%10s STANDARD; %3s; %d AA.", + $seq->display_id(), $mol, $len); + } + + $self->_print( "ID $temp_line\n"); + + # if there, write the accession line + local($^W) = 0; # supressing warnings about uninitialized fields + + if( $self->_ac_generation_func ) { + $temp_line = &{$self->_ac_generation_func}($seq); + $self->_print( "AC $temp_line\n"); + } else { + if ($seq->can('accession_number') ) { + $self->_print("AC ",$seq->accession_number,";"); + if ($seq->can('get_secondary_accessions') ) { + foreach my $sacc ($seq->get_secondary_accessions) { + $self->_print(" ",$sacc,";"); + } + $self->_print("\n"); + } + else { + $self->_print("\n"); + } + } + # otherwise - cannot print <sigh> + } + + # Date lines + + if( $seq->can('get_dates') ) { + foreach my $dt ( $seq->get_dates() ) { + $self->_write_line_swissprot_regex("DT ","DT ", + $dt,"\\s\+\|\$",80); + } + } + + #Definition lines + $self->_write_line_swissprot_regex("DE ","DE ",$seq->desc(),"\\s\+\|\$",80); + + #Gene name + if ((my @genes = $seq->annotation->get_Annotations('gene_name') ) ) { + $self->_print("GN ", + join(' OR ', + map { + $_->isa("Bio::Annotation::StructuredValue") ? + $_->value(-joins => [" AND ", " OR "]) : + $_->value(); + } @genes), + ".\n"); + } + + # Organism lines + if ($seq->can('species') && (my $spec = $seq->species)) { + my($species, @class) = $spec->classification(); + my $genus = $class[0]; + my $OS = "$genus $species"; + if ($class[$#class] =~ /viruses/i) { + # different OS / OC syntax for viruses LP 09/16/2000 + shift @class; + } + if (my $ssp = $spec->sub_species) { + $OS .= " $ssp"; + } + foreach (($spec->variant, $spec->common_name)) { + $OS .= " ($_)" if $_; + } + $self->_print( "OS $OS.\n"); + my $OC = join('; ', reverse(@class)) .'.'; + $self->_write_line_swissprot_regex("OC ","OC ",$OC,"\; \|\$",80); + if ($spec->organelle) { + $self->_write_line_swissprot_regex("OG ","OG ",$spec->organelle,"\; \|\$",80); + } + if ($spec->ncbi_taxid) { + $self->_print("OX NCBI_TaxID=".$spec->ncbi_taxid.";\n"); + } + } + + # Reference lines + my $t = 1; + foreach my $ref ( $seq->annotation->get_Annotations('reference') ) { + $self->_print( "RN [$t]\n"); + # changed by lorenz 08/03/00 + # j.gilbert and h.lapp agreed that the rp line in swissprot seems + # more like a comment than a parseable value, so print it as is + if ($ref->rp) { + $self->_write_line_swissprot_regex("RP ","RP ",$ref->rp, + "\\s\+\|\$",80); + } + if ($ref->comment) { + $self->_write_line_swissprot_regex("RC ","RC ",$ref->comment, + "\\s\+\|\$",80); + } + if ($ref->medline) { + # new RX format in swissprot LP 09/17/00 + if ($ref->pubmed) { + $self->_write_line_swissprot_regex("RX ","RX ", + "MEDLINE=".$ref->medline. + "; PubMed=".$ref->pubmed.";", + "\\s\+\|\$",80); + } else { + $self->_write_line_swissprot_regex("RX MEDLINE; ","RX MEDLINE; ", + $ref->medline.".","\\s\+\|\$",80); + } + } + my $author = $ref->authors .';' if($ref->authors); + my $title = $ref->title .';' if( $ref->title); + + $self->_write_line_swissprot_regex("RA ","RA ",$author,"\\s\+\|\$",80); + $self->_write_line_swissprot_regex("RT ","RT ",$title,"\\s\+\|\$",80); + $self->_write_line_swissprot_regex("RL ","RL ",$ref->location,"\\s\+\|\$",80); + $t++; + } + + # Comment lines + + foreach my $comment ( $seq->annotation->get_Annotations('comment') ) { + foreach my $cline (split ("\n", $comment->text)) { + while (length $cline > 74) { + $self->_print("CC ",(substr $cline,0,74),"\n"); + $cline = substr $cline,74; + } + $self->_print("CC ",$cline,"\n"); + } + } + + foreach my $dblink ( $seq->annotation->get_Annotations('dblink') ) + { + if (defined($dblink->comment)&&($dblink->comment)) { + $self->_print("DR ",$dblink->database,"; ",$dblink->primary_id,"; ", + $dblink->optional_id,"; ",$dblink->comment,".\n"); + } elsif($dblink->optional_id) { + $self->_print("DR ",$dblink->database,"; ", + $dblink->primary_id,"; ", + $dblink->optional_id,".\n"); + } + else { + $self->_print("DR ",$dblink->database, + "; ",$dblink->primary_id,"; ","-.\n"); + } + } + + # if there, write the kw line + { + my( $kw ); + if( my $func = $self->_kw_generation_func ) { + $kw = &{$func}($seq); + } elsif( $seq->can('keywords') ) { + $kw = $seq->keywords; + if( ref($kw) =~ /ARRAY/i ) { + $kw = join("; ", @$kw); + } + $kw .= '.' if( $kw !~ /\.$/ ); + } + $self->_write_line_swissprot_regex("KW ","KW ", + $kw, "\\s\+\|\$",80); + } + + #Check if there are seqfeatures before printing the FT line + my @feats = $seq->can('top_SeqFeatures') ? $seq->top_SeqFeatures : (); + if ($feats[0]) { + if( defined $self->_post_sort ) { + + # we need to read things into an array. Process. Sort them. Print 'em + + my $post_sort_func = $self->_post_sort(); + my @fth; + + foreach my $sf ( @feats ) { + push(@fth,Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq)); + } + @fth = sort { &$post_sort_func($a,$b) } @fth; + + foreach my $fth ( @fth ) { + $self->_print_swissprot_FTHelper($fth); + } + } else { + # not post sorted. And so we can print as we get them. + # lower memory load... + + foreach my $sf ( @feats ) { + my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq); + foreach my $fth ( @fth ) { + if( ! $fth->isa('Bio::SeqIO::FTHelper') ) { + $sf->throw("Cannot process FTHelper... $fth"); + } + + $self->_print_swissprot_FTHelper($fth); + } + } + } + + if( $self->_show_dna() == 0 ) { + return; + } + } + # finished printing features. + + # molecular weight + my $mw = ${Bio::Tools::SeqStats->get_mol_wt($seq->primary_seq)}[0]; + # checksum + # was crc32 checksum, changed it to crc64 + my $crc64 = $self->_crc64(\$str); + $self->_print( sprintf("SQ SEQUENCE %4d AA; %d MW; %16s CRC64;\n", + $len,$mw,$crc64)); + $self->_print( " "); + my $linepos; + for ($i = 0; $i < length($str); $i += 10) { + $self->_print( substr($str,$i,10), " "); + $linepos += 11; + if( ($i+10)%60 == 0 && (($i+10) < length($str))) { + $self->_print( "\n "); + } + } + $self->_print( "\n//\n"); + + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; + } +} + +# Thanks to James Gilbert for the following two. LP 08/01/2000 + +=head2 _generateCRCTable + + Title : _generateCRCTable + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub _generateCRCTable { + # 10001000001010010010001110000100 + # 32 + my $poly = 0xEDB88320; + my ($self) = shift; + + $self->{'_crcTable'} = []; + foreach my $i (0..255) { + my $crc = $i; + for (my $j=8; $j > 0; $j--) { + if ($crc & 1) { + $crc = ($crc >> 1) ^ $poly; + } + else { + $crc >>= 1; + } + } + ${$self->{'_crcTable'}}[$i] = $crc; + } +} + + +=head2 _crc32 + + Title : _crc32 + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub _crc32 { + my( $self, $str ) = @_; + + $self->throw("Argument to crc32() must be ref to scalar") + unless ref($str) eq 'SCALAR'; + + $self->_generateCRCTable() unless exists $self->{'_crcTable'}; + + my $len = length($$str); + + my $crc = 0xFFFFFFFF; + for (my $i = 0; $i < $len; $i++) { + # Get upper case value of each letter + my $int = ord uc substr $$str, $i, 1; + $crc = (($crc >> 8) & 0x00FFFFFF) ^ + ${$self->{'_crcTable'}}[ ($crc ^ $int) & 0xFF ]; + } + return $crc; +} + +=head2 _crc64 + + Title : _crc64 + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub _crc64{ + my ($self, $sequence) = @_; + my $POLY64REVh = 0xd8000000; + my @CRCTableh = 256; + my @CRCTablel = 256; + my $initialized; + + + my $seq = $$sequence; + + my $crcl = 0; + my $crch = 0; + if (!$initialized) { + $initialized = 1; + for (my $i=0; $i<256; $i++) { + my $partl = $i; + my $parth = 0; + for (my $j=0; $j<8; $j++) { + my $rflag = $partl & 1; + $partl >>= 1; + $partl |= (1 << 31) if $parth & 1; + $parth >>= 1; + $parth ^= $POLY64REVh if $rflag; + } + $CRCTableh[$i] = $parth; + $CRCTablel[$i] = $partl; + } + } + + foreach (split '', $seq) { + my $shr = ($crch & 0xFF) << 24; + my $temp1h = $crch >> 8; + my $temp1l = ($crcl >> 8) | $shr; + my $tableindex = ($crcl ^ (unpack "C", $_)) & 0xFF; + $crch = $temp1h ^ $CRCTableh[$tableindex]; + $crcl = $temp1l ^ $CRCTablel[$tableindex]; + } + my $crc64 = sprintf("%08X%08X", $crch, $crcl); + + return $crc64; + +} + +=head2 _print_swissprot_FTHelper + + Title : _print_swissprot_FTHelper + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub _print_swissprot_FTHelper { + my ($self,$fth,$always_quote) = @_; + $always_quote ||= 0; + my ($start,$end) = ('?', '?'); + + if( ! ref $fth || ! $fth->isa('Bio::SeqIO::FTHelper') ) { + $fth->warn("$fth is not a FTHelper class. ". + "Attempting to print, but there could be tears!"); + } + + if( $fth->loc =~ /(\?|\d+|\>\d+|<\d+)?\.\.(\?|\d+|<\d+|>\d+)?/ ) { + $start = $1 if defined $1; + $end = $2 if defined $2; + + # to_FTString only returns one value when start == end, #JB955 + # so if no match is found, assume it is both start and end #JB955 + } else { + $start = $end = $fth->loc; + } + + my $desc = ""; + $desc = @{$fth->field->{"description"}}[0]."." + if exists $fth->field->{"description"}; + $self->_write_line_swissprot_regex(sprintf("FT %-8s %6s %6s ", + substr($fth->key,0,8), + $start,$end), + "FT ", + $desc.'.','\s+|$',80); +} +#' + +=head2 _read_swissprot_References + + Title : _read_swissprot_References + Usage : + Function: Reads references from swissprot format. Internal function really + Example : + Returns : + Args : + + +=cut + +sub _read_swissprot_References{ + my ($self,$buffer) = @_; + my (@refs); + my ($b1, $b2, $rp, $title, $loc, $au, $med, $com, $pubmed); + + if ($$buffer !~ /^RP/) { + $$buffer = $self->_readline; + } + if( !defined $$buffer ) { return undef; } + if( $$buffer =~ /^RP/ ) { + if ($$buffer =~ /^RP (SEQUENCE OF (\d+)-(\d+).*)/) { + $rp=$1; + $b1=$2; + $b2=$3; + } + elsif ($$buffer =~ /^RP (.*)/) { + $rp=$1; + } + + } + while( defined ($_ = $self->_readline) ) { + #/^CC/ && last; + /^RN/ && last; # separator between references ! LP 07/25/2000 + #/^SQ/ && last; # there may be sequences without CC lines! HL 05/11/2000 + /^[^R]/ && last; # may be the safest exit point HL 05/11/2000 + /^RX MEDLINE;\s+(\d+)/ && do {$med=$1}; + /^RX MEDLINE=(\d+);\s+PubMed=(\d+);/ && do {$med=$1;$pubmed=$2}; + /^RA (.*)/ && do { $au .= $au ? " $1" : $1; next;}; + /^RT (.*)/ && do { $title .= $title ? " $1" : $1; next;}; + /^RL (.*)/ && do { $loc .= $loc ? " $1" : $1; next;}; + /^RC (.*)/ && do { $com .= $com ? " $1" : $1; next;}; + } + + my $ref = new Bio::Annotation::Reference; + $au =~ s/;\s*$//g; + if( defined $title ) { + $title =~ s/;\s*$//g; + } + + $ref->start($b1); + $ref->end($b2); + $ref->authors($au); + $ref->title($title); + $ref->location($loc); + $ref->medline($med); + $ref->pubmed($pubmed) if (defined $pubmed); + $ref->comment($com); + $ref->rp($rp); + + push(@refs,$ref); + $$buffer = $_; + return \@refs; +} + + +=head2 _read_swissprot_Species + + Title : _read_swissprot_Species + Usage : + Function: Reads the swissprot Organism species and classification + lines. + Example : + Returns : A Bio::Species object + Args : + +=cut + +sub _read_swissprot_Species { + my( $self, $buffer ) = @_; + my $org; + + $_ = $$buffer; + my( $subspecies, $species, $genus, $common, $variant, $ncbi_taxid ); + my @class; + my ($binomial, $descr); + my $osline = ""; + + while (defined( $_ ||= $self->_readline )) { + last unless /^O[SCGX]/; + # believe it or not, but OS may come multiple times -- at this time + # we can't capture multiple species + if(/^OS\s+(\S.+)/ && (! defined($binomial))) { + $osline .= " " if $osline; + $osline .= $1; + if($osline =~ s/(,|, and|\.)$//) { + ($binomial, $descr) = $osline =~ /(\S[^\(]+)(.*)/; + ($genus, $species, $subspecies) = split(/\s+/, $binomial); + $species = "sp." unless $species; + while($descr =~ /\(([^\)]+)\)/g) { + my $item = $1; + # strain etc may not necessarily come first (yes, swissprot + # is messy) + if((! defined($variant)) && + (($item =~ /(^|[^\(\w])([Ss]train|isolate|serogroup|serotype|subtype|clone)\b/) || + ($item =~ /^(biovar|pv\.|type\s+)/))) { + $variant = $item; + } elsif($item =~ s/^subsp\.\s+//) { + if(! $subspecies) { + $subspecies = $item; + } elsif(! $variant) { + $variant = $item; + } + } elsif(! defined($common)) { + # we're only interested in the first common name + $common = $item; + if((index($common, '(') >= 0) && + (index($common, ')') < 0)) { + $common .= ')'; + } + } + } + } + } + elsif (s/^OC\s+//) { + push(@class, split /[\;\.]\s*/); + if($class[0] =~ /viruses/i) { + # viruses have different OS/OC syntax + my @virusnames = split(/\s+/, $binomial); + $species = (@virusnames > 1) ? pop(@virusnames) : ''; + $genus = join(" ", @virusnames); + $subspecies = undef; + } + } + elsif (/^OG\s+(.*)/) { + $org = $1; + } + elsif (/^OX\s+(.*)/ && (! defined($ncbi_taxid))) { + my $taxstring = $1; + # we only keep the first one and ignore all others + if ($taxstring =~ /NCBI_TaxID=([\w\d]+)/) { + $ncbi_taxid = $1; + } else { + $self->throw("$taxstring doesn't look like NCBI_TaxID"); + } + } + + $_ = undef; # Empty $_ to trigger read of next line + } + + $$buffer = $_; + + # Don't make a species object if it is "Unknown" or "None" + return if $genus =~ /^(Unknown|None)$/i; + + if ($class[$#class] eq $genus) { + push( @class, $species ); + } else { + push( @class, $genus, $species ); + } + + @class = reverse @class; + + my $taxon = Bio::Species->new(); + $taxon->classification( \@class, "FORCE" ); # no name validation please + $taxon->common_name( $common ) if $common; + $taxon->sub_species( $subspecies ) if $subspecies; + $taxon->organelle ( $org ) if $org; + $taxon->ncbi_taxid ( $ncbi_taxid ) if $ncbi_taxid; + $taxon->variant($variant) if $variant; + + # done + return $taxon; +} + +=head2 _filehandle + + Title : _filehandle + Usage : $obj->_filehandle($newval) + Function: + Example : + Returns : value of _filehandle + Args : newvalue (optional) + + +=cut + +# inherited from SeqIO.pm ! HL 05/11/2000 + +=head2 _read_FTHelper_swissprot + + Title : _read_FTHelper_swissprot + Usage : _read_FTHelper_swissprot(\$buffer) + Function: reads the next FT key line + Example : + Returns : Bio::SeqIO::FTHelper object + Args : filehandle and reference to a scalar + + +=cut + +sub _read_FTHelper_swissprot { + # initial version implemented by HL 05/10/2000 + # FIXME this may not be perfect, so please review + my ($self,$buffer) = @_; + my ($key, # The key of the feature + $loc, # The location line from the feature + $desc, # The descriptive text + ); + + if ($$buffer =~ /^FT (\w+)\s+([\d\?\<]+)\s+([\d\?\>]+)\s*(.*)$/) { + $key = $1; + my $loc1 = $2; + my $loc2 = $3; + $loc = "$loc1..$loc2"; + if($4 && (length($4) > 0)) { + $desc = $4; + chomp($desc); + } else { + $desc = ""; + } + # Read all the continuation lines up to the next feature + while (defined($_ = $self->_readline) && /^FT\s{20,}(\S.*)$/) { + $desc .= $1; + chomp($desc); + } + $desc =~ s/\.$//; + } else { + # No feature key. What's this? + $self->warn("No feature key in putative feature table line: $_"); + return; + } + + # Put the first line of the next feature into the buffer + $$buffer = $_; + + # Make the new FTHelper object + my $out = new Bio::SeqIO::FTHelper(-verbose => $self->verbose()); + $out->key($key); + $out->loc($loc); + + # store the description if there is one + if($desc && (length($desc) > 0)) { + $out->field->{"description"} ||= []; + push(@{$out->field->{"description"}}, $desc); + } + return $out; +} + + +=head2 _write_line_swissprot + + Title : _write_line_swissprot + Usage : + Function: internal function + Example : + Returns : + Args : + + +=cut + +sub _write_line_swissprot{ + my ($self,$pre1,$pre2,$line,$length) = @_; + + $length || die "Miscalled write_line_swissprot without length. Programming error!"; + my $subl = $length - length $pre2; + my $linel = length $line; + my $i; + + my $sub = substr($line,0,$length - length $pre1); + + $self->_print( "$pre1$sub\n"); + + for($i= ($length - length $pre1);$i < $linel;) { + $sub = substr($line,$i,($subl)); + $self->_print( "$pre2$sub\n"); + $i += $subl; + } + +} + +=head2 _write_line_swissprot_regex + + Title : _write_line_swissprot_regex + Usage : + Function: internal function for writing lines of specified + length, with different first and the next line + left hand headers and split at specific points in the + text + Example : + Returns : nothing + Args : file handle, first header, second header, text-line, regex for line breaks, total line length + + +=cut + +sub _write_line_swissprot_regex { + my ($self,$pre1,$pre2,$line,$regex,$length) = @_; + + #print STDOUT "Going to print with $line!\n"; + + $length || die "Miscalled write_line_swissprot without length. Programming error!"; + + if( length $pre1 != length $pre2 ) { + print STDERR "len 1 is ", length $pre1, " len 2 is ", length $pre2, "\n"; + die "Programming error - cannot called write_line_swissprot_regex with different length \npre1 ($pre1) and \npre2 ($pre2) tags!"; + } + + my $subl = $length - (length $pre1) -1 ; + my @lines; + + while($line =~ m/(.{1,$subl})($regex)/g) { + push(@lines, $1.$2); + } + + my $s = shift @lines; + $self->_print( "$pre1$s\n"); + foreach my $s ( @lines ) { + $self->_print( "$pre2$s\n"); + } +} + +=head2 _post_sort + + Title : _post_sort + Usage : $obj->_post_sort($newval) + Function: + Returns : value of _post_sort + Args : newvalue (optional) + + +=cut + +sub _post_sort{ + my $obj = shift; + if( @_ ) { + my $value = shift; + $obj->{'_post_sort'} = $value; + } + return $obj->{'_post_sort'}; + +} + +=head2 _show_dna + + Title : _show_dna + Usage : $obj->_show_dna($newval) + Function: + Returns : value of _show_dna + Args : newvalue (optional) + + +=cut + +sub _show_dna{ + my $obj = shift; + if( @_ ) { + my $value = shift; + $obj->{'_show_dna'} = $value; + } + return $obj->{'_show_dna'}; + +} + +=head2 _id_generation_func + + Title : _id_generation_func + Usage : $obj->_id_generation_func($newval) + Function: + Returns : value of _id_generation_func + Args : newvalue (optional) + + +=cut + +sub _id_generation_func{ + my $obj = shift; + if( @_ ) { + my $value = shift; + $obj->{'_id_generation_func'} = $value; + } + return $obj->{'_id_generation_func'}; + +} + +=head2 _ac_generation_func + + Title : _ac_generation_func + Usage : $obj->_ac_generation_func($newval) + Function: + Returns : value of _ac_generation_func + Args : newvalue (optional) + + +=cut + +sub _ac_generation_func{ + my $obj = shift; + if( @_ ) { + my $value = shift; + $obj->{'_ac_generation_func'} = $value; + } + return $obj->{'_ac_generation_func'}; + +} + +=head2 _sv_generation_func + + Title : _sv_generation_func + Usage : $obj->_sv_generation_func($newval) + Function: + Returns : value of _sv_generation_func + Args : newvalue (optional) + + +=cut + +sub _sv_generation_func{ + my $obj = shift; + if( @_ ) { + my $value = shift; + $obj->{'_sv_generation_func'} = $value; + } + return $obj->{'_sv_generation_func'}; + +} + +=head2 _kw_generation_func + + Title : _kw_generation_func + Usage : $obj->_kw_generation_func($newval) + Function: + Returns : value of _kw_generation_func + Args : newvalue (optional) + + +=cut + +sub _kw_generation_func{ + my $obj = shift; + if( @_ ) { + my $value = shift; + $obj->{'_kw_generation_func'} = $value; + } + return $obj->{'_kw_generation_func'}; + +} + +1;