comparison variant_effect_predictor/Bio/SeqIO/swiss.pm @ 0:2bc9b66ada89 draft default tip

Uploaded
author mahtabm
date Thu, 11 Apr 2013 06:29:17 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:2bc9b66ada89
1 # $Id: swiss.pm,v 1.66.2.4 2003/09/13 22:16:43 jason Exp $
2 #
3 # BioPerl module for Bio::SeqIO::swiss
4 #
5 # Cared for by Elia Stupka <elia@tll.org.sg>
6 #
7 # Copyright Elia Stupka
8 #
9 # You may distribute this module under the same terms as perl itself
10
11 # POD documentation - main docs before the code
12
13 =head1 NAME
14
15 Bio::SeqIO::swiss - Swissprot sequence input/output stream
16
17 =head1 SYNOPSIS
18
19 It is probably best not to use this object directly, but
20 rather go through the SeqIO handler system. Go:
21
22 $stream = Bio::SeqIO->new(-file => $filename, -format => 'swiss');
23
24 while ( my $seq = $stream->next_seq() ) {
25 # do something with $seq
26 }
27
28 =head1 DESCRIPTION
29
30 This object can transform Bio::Seq objects to and from swissprot flat
31 file databases.
32
33 There is a lot of flexibility here about how to dump things which I need
34 to document fully.
35
36
37 =head2 Optional functions
38
39 =over 3
40
41 =item _show_dna()
42
43 (output only) shows the dna or not
44
45 =item _post_sort()
46
47 (output only) provides a sorting func which is applied to the FTHelpers
48 before printing
49
50 =item _id_generation_func()
51
52 This is function which is called as
53
54 print "ID ", $func($seq), "\n";
55
56 To generate the ID line. If it is not there, it generates a sensible ID
57 line using a number of tools.
58
59 If you want to output annotations in swissprot format they need to be
60 stored in a Bio::Annotation::Collection object which is accessible
61 through the Bio::SeqI interface method L<annotation()|annotation>.
62
63 The following are the names of the keys which are polled from a
64 L<Bio::Annotation::Collection> object.
65
66 reference - Should contain Bio::Annotation::Reference objects
67 comment - Should contain Bio::Annotation::Comment objects
68 dblink - Should contain Bio::Annotation::DBLink objects
69 gene_name - Should contain Bio::Annotation::SimpleValue object
70
71 =back
72
73 =head1 FEEDBACK
74
75 =head2 Mailing Lists
76
77 User feedback is an integral part of the evolution of this
78 and other Bioperl modules. Send your comments and suggestions preferably
79 to one of the Bioperl mailing lists.
80 Your participation is much appreciated.
81
82 bioperl-l@bioperl.org - General discussion
83 http://bio.perl.org/MailList.html - About the mailing lists
84
85 =head2 Reporting Bugs
86
87 Report bugs to the Bioperl bug tracking system to help us keep track
88 the bugs and their resolution.
89 Bug reports can be submitted via email or the web:
90
91 bioperl-bugs@bio.perl.org
92 http://bugzilla.bioperl.org/
93
94 =head1 AUTHOR - Elia Stupka
95
96 Email elia@tll.org.sg
97
98 Describe contact details here
99
100 =head1 APPENDIX
101
102 The rest of the documentation details each of the object
103 methods. Internal methods are usually preceded with a _
104
105 =cut
106
107
108 # Let the code begin...
109
110
111 package Bio::SeqIO::swiss;
112 use vars qw(@ISA);
113 use strict;
114 use Bio::SeqIO;
115 use Bio::SeqIO::FTHelper;
116 use Bio::SeqFeature::Generic;
117 use Bio::Species;
118 use Bio::Tools::SeqStats;
119 use Bio::Seq::SeqFactory;
120 use Bio::Annotation::Collection;
121 use Bio::Annotation::Comment;
122 use Bio::Annotation::Reference;
123 use Bio::Annotation::DBLink;
124 use Bio::Annotation::SimpleValue;
125 use Bio::Annotation::StructuredValue;
126
127 @ISA = qw(Bio::SeqIO);
128
129
130 sub _initialize {
131 my($self,@args) = @_;
132 $self->SUPER::_initialize(@args);
133
134 # hash for functions for decoding keys.
135 $self->{'_func_ftunit_hash'} = {};
136 $self->_show_dna(1); # sets this to one by default. People can change it
137 if( ! defined $self->sequence_factory ) {
138 $self->sequence_factory(new Bio::Seq::SeqFactory
139 (-verbose => $self->verbose(),
140 -type => 'Bio::Seq::RichSeq'));
141 }
142 }
143
144 =head2 next_seq
145
146 Title : next_seq
147 Usage : $seq = $stream->next_seq()
148 Function: returns the next sequence in the stream
149 Returns : Bio::Seq object
150 Args :
151
152
153 =cut
154
155 sub next_seq {
156 my ($self,@args) = @_;
157 my ($pseq,$c,$line,$name,$desc,$acc,$seqc,$mol,$div,
158 $date,$comment,@date_arr);
159
160 my $genename = "";
161 my ($annotation, %params, @features) = ( new Bio::Annotation::Collection);
162
163 $line = $self->_readline;
164
165 if( !defined $line) {
166 return undef; # no throws - end of file
167 }
168
169 if( $line =~ /^\s+$/ ) {
170 while( defined ($line = $self->_readline) ) {
171 $line =~ /\S/ && last;
172 }
173 }
174 if( !defined $line ) {
175 return undef; # end of file
176 }
177
178 # fixed to allow _DIVISION to be optional for bug #946
179 # see bug report for more information
180 $line =~ /^ID\s+([^\s_]+)(_([^\s_]+))?\s+([^\s;]+);\s+([^\s;]+);/
181 || $self->throw("swissprot stream with no ID. Not swissprot in my book");
182
183 if( $3 ) {
184 $name = "$1$2";
185 $params{'-division'} = $3;
186 } else {
187 $name = $1;
188 $params{'-division'} = 'UNK';
189 $params{'-primary_id'} = $1;
190 }
191 $params{'-alphabet'} = 'protein';
192 # this is important to have the id for display in e.g. FTHelper, otherwise
193 # you won't know which entry caused an error
194 $params{'-display_id'} = $name;
195
196 my $buffer = $line;
197
198 BEFORE_FEATURE_TABLE :
199 until( !defined ($buffer) ) {
200 $_ = $buffer;
201
202 # Exit at start of Feature table
203 last if /^FT/;
204 # and at the sequence at the latest HL 05/11/2000
205 last if /^SQ/;
206
207 # Description line(s)
208 if (/^DE\s+(\S.*\S)/) {
209 $desc .= $desc ? " $1" : $1;
210 }
211 #Gene name
212 elsif(/^GN\s+(.*)/) {
213 $genename .= " " if $genename;
214 $genename .= $1;
215 # has GN terminated yet?
216 if($genename =~ s/[\. ]+$//) {
217 my $gn = Bio::Annotation::StructuredValue->new();
218 foreach my $gene (split(/ AND /, $genename)) {
219 $gene =~ s/^\(//;
220 $gene =~ s/\)$//;
221 $gn->add_value([-1,-1], split(/ OR /, $gene));
222 }
223 $annotation->add_Annotation('gene_name',$gn,
224 "Bio::Annotation::SimpleValue");
225 }
226 }
227 #accession number(s)
228 elsif( /^AC\s+(.+)/) {
229 my @accs = split(/[; ]+/, $1); # allow space in addition
230 $params{'-accession_number'} = shift @accs
231 unless defined $params{'-accession_number'};
232 push @{$params{'-secondary_accessions'}}, @accs;
233 }
234 #version number
235 elsif( /^SV\s+(\S+);?/ ) {
236 my $sv = $1;
237 $sv =~ s/\;//;
238 $params{'-seq_version'} = $sv;
239 }
240 #date
241 elsif( /^DT\s+(.*)/ ) {
242 my $date = $1;
243 $date =~ s/\;//;
244 $date =~ s/\s+$//;
245 push @{$params{'-dates'}}, $date;
246 }
247 # Organism name and phylogenetic information
248 elsif (/^O[SCG]/) {
249 my $species = $self->_read_swissprot_Species(\$buffer);
250 $params{'-species'}= $species;
251 # now we are one line ahead -- so continue without reading the next
252 # line HL 05/11/2000
253 next;
254 }
255 # References
256 elsif (/^R/) {
257 my $refs = $self->_read_swissprot_References(\$buffer);
258
259 foreach my $r (@$refs) {
260 $annotation->add_Annotation('reference',$r);
261 }
262 # now we are one line ahead -- so continue without reading the next
263 # line HL 05/11/2000
264 next;
265 }
266 #Comments
267 elsif (/^CC\s{3}(.*)/) {
268 $comment .= $1;
269 $comment .= "\n";
270 while (defined ($buffer = $self->_readline)) {
271 if ($buffer =~ /^CC\s{3}(.*)/) {
272 $comment .= $1;
273 $comment .= "\n";
274 }
275 else {
276 last;
277 }
278 }
279 my $commobj = Bio::Annotation::Comment->new();
280 # note: don't try to process comments here -- they may contain
281 # structure. LP 07/30/2000
282 $commobj->text($comment);
283 $annotation->add_Annotation('comment',$commobj);
284 $comment = "";
285 # now we are one line ahead -- so continue without reading the next
286 # line HL 05/11/2000
287 next;
288 }
289 #DBLinks
290 elsif (/^DR\s+(\S+)\;\s+(\S+)\;\s+(\S+)[\;\.](.*)$/) {
291 my $dblinkobj = Bio::Annotation::DBLink->new();
292 $dblinkobj->database($1);
293 $dblinkobj->primary_id($2);
294 $dblinkobj->optional_id($3);
295 my $comment = $4;
296 if(length($comment) > 0) {
297 # edit comment to get rid of leading space and trailing dot
298 if( $comment =~ /^\s*(\S+)\./ ) {
299 $dblinkobj->comment($1);
300 } else {
301 $dblinkobj->comment($comment);
302 }
303 }
304 $annotation->add_Annotation('dblink',$dblinkobj);
305 }
306 #keywords
307 elsif( /^KW\s+(.*)$/ ) {
308 my @kw = split(/\s*\;\s*/,$1);
309 defined $kw[-1] && $kw[-1] =~ s/\.$//;
310 push @{$params{'-keywords'}}, @kw;
311 }
312
313
314 # Get next line. Getting here assumes that we indeed need to read the
315 # line.
316 $buffer = $self->_readline;
317 }
318
319 $buffer = $_;
320
321 FEATURE_TABLE :
322 # if there is no feature table, or if we've got beyond, exit loop or don't
323 # even enter HL 05/11/2000
324 while (defined ($buffer) && ($buffer =~ /^FT/)) {
325 my $ftunit = $self->_read_FTHelper_swissprot(\$buffer);
326
327 # process ftunit
328 # when parsing of the line fails we get undef returned
329 if($ftunit) {
330 push(@features,
331 $ftunit->_generic_seqfeature($self->location_factory(),
332 $params{'-seqid'}, "SwissProt"));
333 } else {
334 $self->warn("failed to parse feature table line for seq " .
335 $params{'-display_id'});
336 }
337 }
338 if( $buffer !~ /^SQ/ ) {
339 while( defined($_ = $self->_readline) ) {
340 /^SQ/ && last;
341 }
342 }
343 $seqc = "";
344 while( defined ($_ = $self->_readline) ) {
345 /^\/\// && last;
346 $_ = uc($_);
347 s/[^A-Za-z]//g;
348 $seqc .= $_;
349 }
350
351 my $seq= $self->sequence_factory->create
352 (-verbose => $self->verbose,
353 %params,
354 -seq => $seqc,
355 -desc => $desc,
356 -features => \@features,
357 -annotation => $annotation,
358 );
359
360 # The annotation doesn't get added by the contructor
361 $seq->annotation($annotation);
362
363 return $seq;
364 }
365
366 =head2 write_seq
367
368 Title : write_seq
369 Usage : $stream->write_seq($seq)
370 Function: writes the $seq object (must be seq) to the stream
371 Returns : 1 for success and 0 for error
372 Args : array of 1 to n Bio::SeqI objects
373
374
375 =cut
376
377 sub write_seq {
378 my ($self,@seqs) = @_;
379 foreach my $seq ( @seqs ) {
380 $self->throw("Attempting to write with no seq!") unless defined $seq;
381
382 if( ! ref $seq || ! $seq->isa('Bio::SeqI') ) {
383 $self->warn(" $seq is not a SeqI compliant module. Attempting to dump, but may fail!");
384 }
385
386 my $i;
387 my $str = $seq->seq;
388
389 my $mol;
390 my $div;
391 my $len = $seq->length();
392
393 if ( !$seq->can('division') || ! defined ($div = $seq->division()) ) {
394 $div = 'UNK';
395 }
396
397 if( ! $seq->can('alphabet') || ! defined ($mol = $seq->alphabet) ) {
398 $mol = 'XXX';
399 }
400
401 my $temp_line;
402 if( $self->_id_generation_func ) {
403 $temp_line = &{$self->_id_generation_func}($seq);
404 } else {
405 #$temp_line = sprintf ("%10s STANDARD; %3s; %d AA.",
406 # $seq->primary_id()."_".$div,$mol,$len);
407 # Reconstructing the ID relies heavily upon the input source having
408 # been in a format that is parsed as this routine expects it -- that is,
409 # by this module itself. This is bad, I think, and immediately breaks
410 # if e.g. the Bio::DB::GenPept module is used as input.
411 # Hence, switch to display_id(); _every_ sequence is supposed to have
412 # this. HL 2000/09/03
413 $mol =~ s/protein/PRT/;
414 $temp_line = sprintf ("%10s STANDARD; %3s; %d AA.",
415 $seq->display_id(), $mol, $len);
416 }
417
418 $self->_print( "ID $temp_line\n");
419
420 # if there, write the accession line
421 local($^W) = 0; # supressing warnings about uninitialized fields
422
423 if( $self->_ac_generation_func ) {
424 $temp_line = &{$self->_ac_generation_func}($seq);
425 $self->_print( "AC $temp_line\n");
426 } else {
427 if ($seq->can('accession_number') ) {
428 $self->_print("AC ",$seq->accession_number,";");
429 if ($seq->can('get_secondary_accessions') ) {
430 foreach my $sacc ($seq->get_secondary_accessions) {
431 $self->_print(" ",$sacc,";");
432 }
433 $self->_print("\n");
434 }
435 else {
436 $self->_print("\n");
437 }
438 }
439 # otherwise - cannot print <sigh>
440 }
441
442 # Date lines
443
444 if( $seq->can('get_dates') ) {
445 foreach my $dt ( $seq->get_dates() ) {
446 $self->_write_line_swissprot_regex("DT ","DT ",
447 $dt,"\\s\+\|\$",80);
448 }
449 }
450
451 #Definition lines
452 $self->_write_line_swissprot_regex("DE ","DE ",$seq->desc(),"\\s\+\|\$",80);
453
454 #Gene name
455 if ((my @genes = $seq->annotation->get_Annotations('gene_name') ) ) {
456 $self->_print("GN ",
457 join(' OR ',
458 map {
459 $_->isa("Bio::Annotation::StructuredValue") ?
460 $_->value(-joins => [" AND ", " OR "]) :
461 $_->value();
462 } @genes),
463 ".\n");
464 }
465
466 # Organism lines
467 if ($seq->can('species') && (my $spec = $seq->species)) {
468 my($species, @class) = $spec->classification();
469 my $genus = $class[0];
470 my $OS = "$genus $species";
471 if ($class[$#class] =~ /viruses/i) {
472 # different OS / OC syntax for viruses LP 09/16/2000
473 shift @class;
474 }
475 if (my $ssp = $spec->sub_species) {
476 $OS .= " $ssp";
477 }
478 foreach (($spec->variant, $spec->common_name)) {
479 $OS .= " ($_)" if $_;
480 }
481 $self->_print( "OS $OS.\n");
482 my $OC = join('; ', reverse(@class)) .'.';
483 $self->_write_line_swissprot_regex("OC ","OC ",$OC,"\; \|\$",80);
484 if ($spec->organelle) {
485 $self->_write_line_swissprot_regex("OG ","OG ",$spec->organelle,"\; \|\$",80);
486 }
487 if ($spec->ncbi_taxid) {
488 $self->_print("OX NCBI_TaxID=".$spec->ncbi_taxid.";\n");
489 }
490 }
491
492 # Reference lines
493 my $t = 1;
494 foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
495 $self->_print( "RN [$t]\n");
496 # changed by lorenz 08/03/00
497 # j.gilbert and h.lapp agreed that the rp line in swissprot seems
498 # more like a comment than a parseable value, so print it as is
499 if ($ref->rp) {
500 $self->_write_line_swissprot_regex("RP ","RP ",$ref->rp,
501 "\\s\+\|\$",80);
502 }
503 if ($ref->comment) {
504 $self->_write_line_swissprot_regex("RC ","RC ",$ref->comment,
505 "\\s\+\|\$",80);
506 }
507 if ($ref->medline) {
508 # new RX format in swissprot LP 09/17/00
509 if ($ref->pubmed) {
510 $self->_write_line_swissprot_regex("RX ","RX ",
511 "MEDLINE=".$ref->medline.
512 "; PubMed=".$ref->pubmed.";",
513 "\\s\+\|\$",80);
514 } else {
515 $self->_write_line_swissprot_regex("RX MEDLINE; ","RX MEDLINE; ",
516 $ref->medline.".","\\s\+\|\$",80);
517 }
518 }
519 my $author = $ref->authors .';' if($ref->authors);
520 my $title = $ref->title .';' if( $ref->title);
521
522 $self->_write_line_swissprot_regex("RA ","RA ",$author,"\\s\+\|\$",80);
523 $self->_write_line_swissprot_regex("RT ","RT ",$title,"\\s\+\|\$",80);
524 $self->_write_line_swissprot_regex("RL ","RL ",$ref->location,"\\s\+\|\$",80);
525 $t++;
526 }
527
528 # Comment lines
529
530 foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
531 foreach my $cline (split ("\n", $comment->text)) {
532 while (length $cline > 74) {
533 $self->_print("CC ",(substr $cline,0,74),"\n");
534 $cline = substr $cline,74;
535 }
536 $self->_print("CC ",$cline,"\n");
537 }
538 }
539
540 foreach my $dblink ( $seq->annotation->get_Annotations('dblink') )
541 {
542 if (defined($dblink->comment)&&($dblink->comment)) {
543 $self->_print("DR ",$dblink->database,"; ",$dblink->primary_id,"; ",
544 $dblink->optional_id,"; ",$dblink->comment,".\n");
545 } elsif($dblink->optional_id) {
546 $self->_print("DR ",$dblink->database,"; ",
547 $dblink->primary_id,"; ",
548 $dblink->optional_id,".\n");
549 }
550 else {
551 $self->_print("DR ",$dblink->database,
552 "; ",$dblink->primary_id,"; ","-.\n");
553 }
554 }
555
556 # if there, write the kw line
557 {
558 my( $kw );
559 if( my $func = $self->_kw_generation_func ) {
560 $kw = &{$func}($seq);
561 } elsif( $seq->can('keywords') ) {
562 $kw = $seq->keywords;
563 if( ref($kw) =~ /ARRAY/i ) {
564 $kw = join("; ", @$kw);
565 }
566 $kw .= '.' if( $kw !~ /\.$/ );
567 }
568 $self->_write_line_swissprot_regex("KW ","KW ",
569 $kw, "\\s\+\|\$",80);
570 }
571
572 #Check if there are seqfeatures before printing the FT line
573 my @feats = $seq->can('top_SeqFeatures') ? $seq->top_SeqFeatures : ();
574 if ($feats[0]) {
575 if( defined $self->_post_sort ) {
576
577 # we need to read things into an array. Process. Sort them. Print 'em
578
579 my $post_sort_func = $self->_post_sort();
580 my @fth;
581
582 foreach my $sf ( @feats ) {
583 push(@fth,Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq));
584 }
585 @fth = sort { &$post_sort_func($a,$b) } @fth;
586
587 foreach my $fth ( @fth ) {
588 $self->_print_swissprot_FTHelper($fth);
589 }
590 } else {
591 # not post sorted. And so we can print as we get them.
592 # lower memory load...
593
594 foreach my $sf ( @feats ) {
595 my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq);
596 foreach my $fth ( @fth ) {
597 if( ! $fth->isa('Bio::SeqIO::FTHelper') ) {
598 $sf->throw("Cannot process FTHelper... $fth");
599 }
600
601 $self->_print_swissprot_FTHelper($fth);
602 }
603 }
604 }
605
606 if( $self->_show_dna() == 0 ) {
607 return;
608 }
609 }
610 # finished printing features.
611
612 # molecular weight
613 my $mw = ${Bio::Tools::SeqStats->get_mol_wt($seq->primary_seq)}[0];
614 # checksum
615 # was crc32 checksum, changed it to crc64
616 my $crc64 = $self->_crc64(\$str);
617 $self->_print( sprintf("SQ SEQUENCE %4d AA; %d MW; %16s CRC64;\n",
618 $len,$mw,$crc64));
619 $self->_print( " ");
620 my $linepos;
621 for ($i = 0; $i < length($str); $i += 10) {
622 $self->_print( substr($str,$i,10), " ");
623 $linepos += 11;
624 if( ($i+10)%60 == 0 && (($i+10) < length($str))) {
625 $self->_print( "\n ");
626 }
627 }
628 $self->_print( "\n//\n");
629
630 $self->flush if $self->_flush_on_write && defined $self->_fh;
631 return 1;
632 }
633 }
634
635 # Thanks to James Gilbert for the following two. LP 08/01/2000
636
637 =head2 _generateCRCTable
638
639 Title : _generateCRCTable
640 Usage :
641 Function:
642 Example :
643 Returns :
644 Args :
645
646
647 =cut
648
649 sub _generateCRCTable {
650 # 10001000001010010010001110000100
651 # 32
652 my $poly = 0xEDB88320;
653 my ($self) = shift;
654
655 $self->{'_crcTable'} = [];
656 foreach my $i (0..255) {
657 my $crc = $i;
658 for (my $j=8; $j > 0; $j--) {
659 if ($crc & 1) {
660 $crc = ($crc >> 1) ^ $poly;
661 }
662 else {
663 $crc >>= 1;
664 }
665 }
666 ${$self->{'_crcTable'}}[$i] = $crc;
667 }
668 }
669
670
671 =head2 _crc32
672
673 Title : _crc32
674 Usage :
675 Function:
676 Example :
677 Returns :
678 Args :
679
680
681 =cut
682
683 sub _crc32 {
684 my( $self, $str ) = @_;
685
686 $self->throw("Argument to crc32() must be ref to scalar")
687 unless ref($str) eq 'SCALAR';
688
689 $self->_generateCRCTable() unless exists $self->{'_crcTable'};
690
691 my $len = length($$str);
692
693 my $crc = 0xFFFFFFFF;
694 for (my $i = 0; $i < $len; $i++) {
695 # Get upper case value of each letter
696 my $int = ord uc substr $$str, $i, 1;
697 $crc = (($crc >> 8) & 0x00FFFFFF) ^
698 ${$self->{'_crcTable'}}[ ($crc ^ $int) & 0xFF ];
699 }
700 return $crc;
701 }
702
703 =head2 _crc64
704
705 Title : _crc64
706 Usage :
707 Function:
708 Example :
709 Returns :
710 Args :
711
712
713 =cut
714
715 sub _crc64{
716 my ($self, $sequence) = @_;
717 my $POLY64REVh = 0xd8000000;
718 my @CRCTableh = 256;
719 my @CRCTablel = 256;
720 my $initialized;
721
722
723 my $seq = $$sequence;
724
725 my $crcl = 0;
726 my $crch = 0;
727 if (!$initialized) {
728 $initialized = 1;
729 for (my $i=0; $i<256; $i++) {
730 my $partl = $i;
731 my $parth = 0;
732 for (my $j=0; $j<8; $j++) {
733 my $rflag = $partl & 1;
734 $partl >>= 1;
735 $partl |= (1 << 31) if $parth & 1;
736 $parth >>= 1;
737 $parth ^= $POLY64REVh if $rflag;
738 }
739 $CRCTableh[$i] = $parth;
740 $CRCTablel[$i] = $partl;
741 }
742 }
743
744 foreach (split '', $seq) {
745 my $shr = ($crch & 0xFF) << 24;
746 my $temp1h = $crch >> 8;
747 my $temp1l = ($crcl >> 8) | $shr;
748 my $tableindex = ($crcl ^ (unpack "C", $_)) & 0xFF;
749 $crch = $temp1h ^ $CRCTableh[$tableindex];
750 $crcl = $temp1l ^ $CRCTablel[$tableindex];
751 }
752 my $crc64 = sprintf("%08X%08X", $crch, $crcl);
753
754 return $crc64;
755
756 }
757
758 =head2 _print_swissprot_FTHelper
759
760 Title : _print_swissprot_FTHelper
761 Usage :
762 Function:
763 Example :
764 Returns :
765 Args :
766
767
768 =cut
769
770 sub _print_swissprot_FTHelper {
771 my ($self,$fth,$always_quote) = @_;
772 $always_quote ||= 0;
773 my ($start,$end) = ('?', '?');
774
775 if( ! ref $fth || ! $fth->isa('Bio::SeqIO::FTHelper') ) {
776 $fth->warn("$fth is not a FTHelper class. ".
777 "Attempting to print, but there could be tears!");
778 }
779
780 if( $fth->loc =~ /(\?|\d+|\>\d+|<\d+)?\.\.(\?|\d+|<\d+|>\d+)?/ ) {
781 $start = $1 if defined $1;
782 $end = $2 if defined $2;
783
784 # to_FTString only returns one value when start == end, #JB955
785 # so if no match is found, assume it is both start and end #JB955
786 } else {
787 $start = $end = $fth->loc;
788 }
789
790 my $desc = "";
791 $desc = @{$fth->field->{"description"}}[0]."."
792 if exists $fth->field->{"description"};
793 $self->_write_line_swissprot_regex(sprintf("FT %-8s %6s %6s ",
794 substr($fth->key,0,8),
795 $start,$end),
796 "FT ",
797 $desc.'.','\s+|$',80);
798 }
799 #'
800
801 =head2 _read_swissprot_References
802
803 Title : _read_swissprot_References
804 Usage :
805 Function: Reads references from swissprot format. Internal function really
806 Example :
807 Returns :
808 Args :
809
810
811 =cut
812
813 sub _read_swissprot_References{
814 my ($self,$buffer) = @_;
815 my (@refs);
816 my ($b1, $b2, $rp, $title, $loc, $au, $med, $com, $pubmed);
817
818 if ($$buffer !~ /^RP/) {
819 $$buffer = $self->_readline;
820 }
821 if( !defined $$buffer ) { return undef; }
822 if( $$buffer =~ /^RP/ ) {
823 if ($$buffer =~ /^RP (SEQUENCE OF (\d+)-(\d+).*)/) {
824 $rp=$1;
825 $b1=$2;
826 $b2=$3;
827 }
828 elsif ($$buffer =~ /^RP (.*)/) {
829 $rp=$1;
830 }
831
832 }
833 while( defined ($_ = $self->_readline) ) {
834 #/^CC/ && last;
835 /^RN/ && last; # separator between references ! LP 07/25/2000
836 #/^SQ/ && last; # there may be sequences without CC lines! HL 05/11/2000
837 /^[^R]/ && last; # may be the safest exit point HL 05/11/2000
838 /^RX MEDLINE;\s+(\d+)/ && do {$med=$1};
839 /^RX MEDLINE=(\d+);\s+PubMed=(\d+);/ && do {$med=$1;$pubmed=$2};
840 /^RA (.*)/ && do { $au .= $au ? " $1" : $1; next;};
841 /^RT (.*)/ && do { $title .= $title ? " $1" : $1; next;};
842 /^RL (.*)/ && do { $loc .= $loc ? " $1" : $1; next;};
843 /^RC (.*)/ && do { $com .= $com ? " $1" : $1; next;};
844 }
845
846 my $ref = new Bio::Annotation::Reference;
847 $au =~ s/;\s*$//g;
848 if( defined $title ) {
849 $title =~ s/;\s*$//g;
850 }
851
852 $ref->start($b1);
853 $ref->end($b2);
854 $ref->authors($au);
855 $ref->title($title);
856 $ref->location($loc);
857 $ref->medline($med);
858 $ref->pubmed($pubmed) if (defined $pubmed);
859 $ref->comment($com);
860 $ref->rp($rp);
861
862 push(@refs,$ref);
863 $$buffer = $_;
864 return \@refs;
865 }
866
867
868 =head2 _read_swissprot_Species
869
870 Title : _read_swissprot_Species
871 Usage :
872 Function: Reads the swissprot Organism species and classification
873 lines.
874 Example :
875 Returns : A Bio::Species object
876 Args :
877
878 =cut
879
880 sub _read_swissprot_Species {
881 my( $self, $buffer ) = @_;
882 my $org;
883
884 $_ = $$buffer;
885 my( $subspecies, $species, $genus, $common, $variant, $ncbi_taxid );
886 my @class;
887 my ($binomial, $descr);
888 my $osline = "";
889
890 while (defined( $_ ||= $self->_readline )) {
891 last unless /^O[SCGX]/;
892 # believe it or not, but OS may come multiple times -- at this time
893 # we can't capture multiple species
894 if(/^OS\s+(\S.+)/ && (! defined($binomial))) {
895 $osline .= " " if $osline;
896 $osline .= $1;
897 if($osline =~ s/(,|, and|\.)$//) {
898 ($binomial, $descr) = $osline =~ /(\S[^\(]+)(.*)/;
899 ($genus, $species, $subspecies) = split(/\s+/, $binomial);
900 $species = "sp." unless $species;
901 while($descr =~ /\(([^\)]+)\)/g) {
902 my $item = $1;
903 # strain etc may not necessarily come first (yes, swissprot
904 # is messy)
905 if((! defined($variant)) &&
906 (($item =~ /(^|[^\(\w])([Ss]train|isolate|serogroup|serotype|subtype|clone)\b/) ||
907 ($item =~ /^(biovar|pv\.|type\s+)/))) {
908 $variant = $item;
909 } elsif($item =~ s/^subsp\.\s+//) {
910 if(! $subspecies) {
911 $subspecies = $item;
912 } elsif(! $variant) {
913 $variant = $item;
914 }
915 } elsif(! defined($common)) {
916 # we're only interested in the first common name
917 $common = $item;
918 if((index($common, '(') >= 0) &&
919 (index($common, ')') < 0)) {
920 $common .= ')';
921 }
922 }
923 }
924 }
925 }
926 elsif (s/^OC\s+//) {
927 push(@class, split /[\;\.]\s*/);
928 if($class[0] =~ /viruses/i) {
929 # viruses have different OS/OC syntax
930 my @virusnames = split(/\s+/, $binomial);
931 $species = (@virusnames > 1) ? pop(@virusnames) : '';
932 $genus = join(" ", @virusnames);
933 $subspecies = undef;
934 }
935 }
936 elsif (/^OG\s+(.*)/) {
937 $org = $1;
938 }
939 elsif (/^OX\s+(.*)/ && (! defined($ncbi_taxid))) {
940 my $taxstring = $1;
941 # we only keep the first one and ignore all others
942 if ($taxstring =~ /NCBI_TaxID=([\w\d]+)/) {
943 $ncbi_taxid = $1;
944 } else {
945 $self->throw("$taxstring doesn't look like NCBI_TaxID");
946 }
947 }
948
949 $_ = undef; # Empty $_ to trigger read of next line
950 }
951
952 $$buffer = $_;
953
954 # Don't make a species object if it is "Unknown" or "None"
955 return if $genus =~ /^(Unknown|None)$/i;
956
957 if ($class[$#class] eq $genus) {
958 push( @class, $species );
959 } else {
960 push( @class, $genus, $species );
961 }
962
963 @class = reverse @class;
964
965 my $taxon = Bio::Species->new();
966 $taxon->classification( \@class, "FORCE" ); # no name validation please
967 $taxon->common_name( $common ) if $common;
968 $taxon->sub_species( $subspecies ) if $subspecies;
969 $taxon->organelle ( $org ) if $org;
970 $taxon->ncbi_taxid ( $ncbi_taxid ) if $ncbi_taxid;
971 $taxon->variant($variant) if $variant;
972
973 # done
974 return $taxon;
975 }
976
977 =head2 _filehandle
978
979 Title : _filehandle
980 Usage : $obj->_filehandle($newval)
981 Function:
982 Example :
983 Returns : value of _filehandle
984 Args : newvalue (optional)
985
986
987 =cut
988
989 # inherited from SeqIO.pm ! HL 05/11/2000
990
991 =head2 _read_FTHelper_swissprot
992
993 Title : _read_FTHelper_swissprot
994 Usage : _read_FTHelper_swissprot(\$buffer)
995 Function: reads the next FT key line
996 Example :
997 Returns : Bio::SeqIO::FTHelper object
998 Args : filehandle and reference to a scalar
999
1000
1001 =cut
1002
1003 sub _read_FTHelper_swissprot {
1004 # initial version implemented by HL 05/10/2000
1005 # FIXME this may not be perfect, so please review
1006 my ($self,$buffer) = @_;
1007 my ($key, # The key of the feature
1008 $loc, # The location line from the feature
1009 $desc, # The descriptive text
1010 );
1011
1012 if ($$buffer =~ /^FT (\w+)\s+([\d\?\<]+)\s+([\d\?\>]+)\s*(.*)$/) {
1013 $key = $1;
1014 my $loc1 = $2;
1015 my $loc2 = $3;
1016 $loc = "$loc1..$loc2";
1017 if($4 && (length($4) > 0)) {
1018 $desc = $4;
1019 chomp($desc);
1020 } else {
1021 $desc = "";
1022 }
1023 # Read all the continuation lines up to the next feature
1024 while (defined($_ = $self->_readline) && /^FT\s{20,}(\S.*)$/) {
1025 $desc .= $1;
1026 chomp($desc);
1027 }
1028 $desc =~ s/\.$//;
1029 } else {
1030 # No feature key. What's this?
1031 $self->warn("No feature key in putative feature table line: $_");
1032 return;
1033 }
1034
1035 # Put the first line of the next feature into the buffer
1036 $$buffer = $_;
1037
1038 # Make the new FTHelper object
1039 my $out = new Bio::SeqIO::FTHelper(-verbose => $self->verbose());
1040 $out->key($key);
1041 $out->loc($loc);
1042
1043 # store the description if there is one
1044 if($desc && (length($desc) > 0)) {
1045 $out->field->{"description"} ||= [];
1046 push(@{$out->field->{"description"}}, $desc);
1047 }
1048 return $out;
1049 }
1050
1051
1052 =head2 _write_line_swissprot
1053
1054 Title : _write_line_swissprot
1055 Usage :
1056 Function: internal function
1057 Example :
1058 Returns :
1059 Args :
1060
1061
1062 =cut
1063
1064 sub _write_line_swissprot{
1065 my ($self,$pre1,$pre2,$line,$length) = @_;
1066
1067 $length || die "Miscalled write_line_swissprot without length. Programming error!";
1068 my $subl = $length - length $pre2;
1069 my $linel = length $line;
1070 my $i;
1071
1072 my $sub = substr($line,0,$length - length $pre1);
1073
1074 $self->_print( "$pre1$sub\n");
1075
1076 for($i= ($length - length $pre1);$i < $linel;) {
1077 $sub = substr($line,$i,($subl));
1078 $self->_print( "$pre2$sub\n");
1079 $i += $subl;
1080 }
1081
1082 }
1083
1084 =head2 _write_line_swissprot_regex
1085
1086 Title : _write_line_swissprot_regex
1087 Usage :
1088 Function: internal function for writing lines of specified
1089 length, with different first and the next line
1090 left hand headers and split at specific points in the
1091 text
1092 Example :
1093 Returns : nothing
1094 Args : file handle, first header, second header, text-line, regex for line breaks, total line length
1095
1096
1097 =cut
1098
1099 sub _write_line_swissprot_regex {
1100 my ($self,$pre1,$pre2,$line,$regex,$length) = @_;
1101
1102 #print STDOUT "Going to print with $line!\n";
1103
1104 $length || die "Miscalled write_line_swissprot without length. Programming error!";
1105
1106 if( length $pre1 != length $pre2 ) {
1107 print STDERR "len 1 is ", length $pre1, " len 2 is ", length $pre2, "\n";
1108 die "Programming error - cannot called write_line_swissprot_regex with different length \npre1 ($pre1) and \npre2 ($pre2) tags!";
1109 }
1110
1111 my $subl = $length - (length $pre1) -1 ;
1112 my @lines;
1113
1114 while($line =~ m/(.{1,$subl})($regex)/g) {
1115 push(@lines, $1.$2);
1116 }
1117
1118 my $s = shift @lines;
1119 $self->_print( "$pre1$s\n");
1120 foreach my $s ( @lines ) {
1121 $self->_print( "$pre2$s\n");
1122 }
1123 }
1124
1125 =head2 _post_sort
1126
1127 Title : _post_sort
1128 Usage : $obj->_post_sort($newval)
1129 Function:
1130 Returns : value of _post_sort
1131 Args : newvalue (optional)
1132
1133
1134 =cut
1135
1136 sub _post_sort{
1137 my $obj = shift;
1138 if( @_ ) {
1139 my $value = shift;
1140 $obj->{'_post_sort'} = $value;
1141 }
1142 return $obj->{'_post_sort'};
1143
1144 }
1145
1146 =head2 _show_dna
1147
1148 Title : _show_dna
1149 Usage : $obj->_show_dna($newval)
1150 Function:
1151 Returns : value of _show_dna
1152 Args : newvalue (optional)
1153
1154
1155 =cut
1156
1157 sub _show_dna{
1158 my $obj = shift;
1159 if( @_ ) {
1160 my $value = shift;
1161 $obj->{'_show_dna'} = $value;
1162 }
1163 return $obj->{'_show_dna'};
1164
1165 }
1166
1167 =head2 _id_generation_func
1168
1169 Title : _id_generation_func
1170 Usage : $obj->_id_generation_func($newval)
1171 Function:
1172 Returns : value of _id_generation_func
1173 Args : newvalue (optional)
1174
1175
1176 =cut
1177
1178 sub _id_generation_func{
1179 my $obj = shift;
1180 if( @_ ) {
1181 my $value = shift;
1182 $obj->{'_id_generation_func'} = $value;
1183 }
1184 return $obj->{'_id_generation_func'};
1185
1186 }
1187
1188 =head2 _ac_generation_func
1189
1190 Title : _ac_generation_func
1191 Usage : $obj->_ac_generation_func($newval)
1192 Function:
1193 Returns : value of _ac_generation_func
1194 Args : newvalue (optional)
1195
1196
1197 =cut
1198
1199 sub _ac_generation_func{
1200 my $obj = shift;
1201 if( @_ ) {
1202 my $value = shift;
1203 $obj->{'_ac_generation_func'} = $value;
1204 }
1205 return $obj->{'_ac_generation_func'};
1206
1207 }
1208
1209 =head2 _sv_generation_func
1210
1211 Title : _sv_generation_func
1212 Usage : $obj->_sv_generation_func($newval)
1213 Function:
1214 Returns : value of _sv_generation_func
1215 Args : newvalue (optional)
1216
1217
1218 =cut
1219
1220 sub _sv_generation_func{
1221 my $obj = shift;
1222 if( @_ ) {
1223 my $value = shift;
1224 $obj->{'_sv_generation_func'} = $value;
1225 }
1226 return $obj->{'_sv_generation_func'};
1227
1228 }
1229
1230 =head2 _kw_generation_func
1231
1232 Title : _kw_generation_func
1233 Usage : $obj->_kw_generation_func($newval)
1234 Function:
1235 Returns : value of _kw_generation_func
1236 Args : newvalue (optional)
1237
1238
1239 =cut
1240
1241 sub _kw_generation_func{
1242 my $obj = shift;
1243 if( @_ ) {
1244 my $value = shift;
1245 $obj->{'_kw_generation_func'} = $value;
1246 }
1247 return $obj->{'_kw_generation_func'};
1248
1249 }
1250
1251 1;