Mercurial > repos > mahtabm > ensemb_rep_gvl
comparison variant_effect_predictor/Bio/SeqIO/swiss.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:2bc9b66ada89 |
---|---|
1 # $Id: swiss.pm,v 1.66.2.4 2003/09/13 22:16:43 jason Exp $ | |
2 # | |
3 # BioPerl module for Bio::SeqIO::swiss | |
4 # | |
5 # Cared for by Elia Stupka <elia@tll.org.sg> | |
6 # | |
7 # Copyright Elia Stupka | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 | |
11 # POD documentation - main docs before the code | |
12 | |
13 =head1 NAME | |
14 | |
15 Bio::SeqIO::swiss - Swissprot sequence input/output stream | |
16 | |
17 =head1 SYNOPSIS | |
18 | |
19 It is probably best not to use this object directly, but | |
20 rather go through the SeqIO handler system. Go: | |
21 | |
22 $stream = Bio::SeqIO->new(-file => $filename, -format => 'swiss'); | |
23 | |
24 while ( my $seq = $stream->next_seq() ) { | |
25 # do something with $seq | |
26 } | |
27 | |
28 =head1 DESCRIPTION | |
29 | |
30 This object can transform Bio::Seq objects to and from swissprot flat | |
31 file databases. | |
32 | |
33 There is a lot of flexibility here about how to dump things which I need | |
34 to document fully. | |
35 | |
36 | |
37 =head2 Optional functions | |
38 | |
39 =over 3 | |
40 | |
41 =item _show_dna() | |
42 | |
43 (output only) shows the dna or not | |
44 | |
45 =item _post_sort() | |
46 | |
47 (output only) provides a sorting func which is applied to the FTHelpers | |
48 before printing | |
49 | |
50 =item _id_generation_func() | |
51 | |
52 This is function which is called as | |
53 | |
54 print "ID ", $func($seq), "\n"; | |
55 | |
56 To generate the ID line. If it is not there, it generates a sensible ID | |
57 line using a number of tools. | |
58 | |
59 If you want to output annotations in swissprot format they need to be | |
60 stored in a Bio::Annotation::Collection object which is accessible | |
61 through the Bio::SeqI interface method L<annotation()|annotation>. | |
62 | |
63 The following are the names of the keys which are polled from a | |
64 L<Bio::Annotation::Collection> object. | |
65 | |
66 reference - Should contain Bio::Annotation::Reference objects | |
67 comment - Should contain Bio::Annotation::Comment objects | |
68 dblink - Should contain Bio::Annotation::DBLink objects | |
69 gene_name - Should contain Bio::Annotation::SimpleValue object | |
70 | |
71 =back | |
72 | |
73 =head1 FEEDBACK | |
74 | |
75 =head2 Mailing Lists | |
76 | |
77 User feedback is an integral part of the evolution of this | |
78 and other Bioperl modules. Send your comments and suggestions preferably | |
79 to one of the Bioperl mailing lists. | |
80 Your participation is much appreciated. | |
81 | |
82 bioperl-l@bioperl.org - General discussion | |
83 http://bio.perl.org/MailList.html - About the mailing lists | |
84 | |
85 =head2 Reporting Bugs | |
86 | |
87 Report bugs to the Bioperl bug tracking system to help us keep track | |
88 the bugs and their resolution. | |
89 Bug reports can be submitted via email or the web: | |
90 | |
91 bioperl-bugs@bio.perl.org | |
92 http://bugzilla.bioperl.org/ | |
93 | |
94 =head1 AUTHOR - Elia Stupka | |
95 | |
96 Email elia@tll.org.sg | |
97 | |
98 Describe contact details here | |
99 | |
100 =head1 APPENDIX | |
101 | |
102 The rest of the documentation details each of the object | |
103 methods. Internal methods are usually preceded with a _ | |
104 | |
105 =cut | |
106 | |
107 | |
108 # Let the code begin... | |
109 | |
110 | |
111 package Bio::SeqIO::swiss; | |
112 use vars qw(@ISA); | |
113 use strict; | |
114 use Bio::SeqIO; | |
115 use Bio::SeqIO::FTHelper; | |
116 use Bio::SeqFeature::Generic; | |
117 use Bio::Species; | |
118 use Bio::Tools::SeqStats; | |
119 use Bio::Seq::SeqFactory; | |
120 use Bio::Annotation::Collection; | |
121 use Bio::Annotation::Comment; | |
122 use Bio::Annotation::Reference; | |
123 use Bio::Annotation::DBLink; | |
124 use Bio::Annotation::SimpleValue; | |
125 use Bio::Annotation::StructuredValue; | |
126 | |
127 @ISA = qw(Bio::SeqIO); | |
128 | |
129 | |
130 sub _initialize { | |
131 my($self,@args) = @_; | |
132 $self->SUPER::_initialize(@args); | |
133 | |
134 # hash for functions for decoding keys. | |
135 $self->{'_func_ftunit_hash'} = {}; | |
136 $self->_show_dna(1); # sets this to one by default. People can change it | |
137 if( ! defined $self->sequence_factory ) { | |
138 $self->sequence_factory(new Bio::Seq::SeqFactory | |
139 (-verbose => $self->verbose(), | |
140 -type => 'Bio::Seq::RichSeq')); | |
141 } | |
142 } | |
143 | |
144 =head2 next_seq | |
145 | |
146 Title : next_seq | |
147 Usage : $seq = $stream->next_seq() | |
148 Function: returns the next sequence in the stream | |
149 Returns : Bio::Seq object | |
150 Args : | |
151 | |
152 | |
153 =cut | |
154 | |
155 sub next_seq { | |
156 my ($self,@args) = @_; | |
157 my ($pseq,$c,$line,$name,$desc,$acc,$seqc,$mol,$div, | |
158 $date,$comment,@date_arr); | |
159 | |
160 my $genename = ""; | |
161 my ($annotation, %params, @features) = ( new Bio::Annotation::Collection); | |
162 | |
163 $line = $self->_readline; | |
164 | |
165 if( !defined $line) { | |
166 return undef; # no throws - end of file | |
167 } | |
168 | |
169 if( $line =~ /^\s+$/ ) { | |
170 while( defined ($line = $self->_readline) ) { | |
171 $line =~ /\S/ && last; | |
172 } | |
173 } | |
174 if( !defined $line ) { | |
175 return undef; # end of file | |
176 } | |
177 | |
178 # fixed to allow _DIVISION to be optional for bug #946 | |
179 # see bug report for more information | |
180 $line =~ /^ID\s+([^\s_]+)(_([^\s_]+))?\s+([^\s;]+);\s+([^\s;]+);/ | |
181 || $self->throw("swissprot stream with no ID. Not swissprot in my book"); | |
182 | |
183 if( $3 ) { | |
184 $name = "$1$2"; | |
185 $params{'-division'} = $3; | |
186 } else { | |
187 $name = $1; | |
188 $params{'-division'} = 'UNK'; | |
189 $params{'-primary_id'} = $1; | |
190 } | |
191 $params{'-alphabet'} = 'protein'; | |
192 # this is important to have the id for display in e.g. FTHelper, otherwise | |
193 # you won't know which entry caused an error | |
194 $params{'-display_id'} = $name; | |
195 | |
196 my $buffer = $line; | |
197 | |
198 BEFORE_FEATURE_TABLE : | |
199 until( !defined ($buffer) ) { | |
200 $_ = $buffer; | |
201 | |
202 # Exit at start of Feature table | |
203 last if /^FT/; | |
204 # and at the sequence at the latest HL 05/11/2000 | |
205 last if /^SQ/; | |
206 | |
207 # Description line(s) | |
208 if (/^DE\s+(\S.*\S)/) { | |
209 $desc .= $desc ? " $1" : $1; | |
210 } | |
211 #Gene name | |
212 elsif(/^GN\s+(.*)/) { | |
213 $genename .= " " if $genename; | |
214 $genename .= $1; | |
215 # has GN terminated yet? | |
216 if($genename =~ s/[\. ]+$//) { | |
217 my $gn = Bio::Annotation::StructuredValue->new(); | |
218 foreach my $gene (split(/ AND /, $genename)) { | |
219 $gene =~ s/^\(//; | |
220 $gene =~ s/\)$//; | |
221 $gn->add_value([-1,-1], split(/ OR /, $gene)); | |
222 } | |
223 $annotation->add_Annotation('gene_name',$gn, | |
224 "Bio::Annotation::SimpleValue"); | |
225 } | |
226 } | |
227 #accession number(s) | |
228 elsif( /^AC\s+(.+)/) { | |
229 my @accs = split(/[; ]+/, $1); # allow space in addition | |
230 $params{'-accession_number'} = shift @accs | |
231 unless defined $params{'-accession_number'}; | |
232 push @{$params{'-secondary_accessions'}}, @accs; | |
233 } | |
234 #version number | |
235 elsif( /^SV\s+(\S+);?/ ) { | |
236 my $sv = $1; | |
237 $sv =~ s/\;//; | |
238 $params{'-seq_version'} = $sv; | |
239 } | |
240 #date | |
241 elsif( /^DT\s+(.*)/ ) { | |
242 my $date = $1; | |
243 $date =~ s/\;//; | |
244 $date =~ s/\s+$//; | |
245 push @{$params{'-dates'}}, $date; | |
246 } | |
247 # Organism name and phylogenetic information | |
248 elsif (/^O[SCG]/) { | |
249 my $species = $self->_read_swissprot_Species(\$buffer); | |
250 $params{'-species'}= $species; | |
251 # now we are one line ahead -- so continue without reading the next | |
252 # line HL 05/11/2000 | |
253 next; | |
254 } | |
255 # References | |
256 elsif (/^R/) { | |
257 my $refs = $self->_read_swissprot_References(\$buffer); | |
258 | |
259 foreach my $r (@$refs) { | |
260 $annotation->add_Annotation('reference',$r); | |
261 } | |
262 # now we are one line ahead -- so continue without reading the next | |
263 # line HL 05/11/2000 | |
264 next; | |
265 } | |
266 #Comments | |
267 elsif (/^CC\s{3}(.*)/) { | |
268 $comment .= $1; | |
269 $comment .= "\n"; | |
270 while (defined ($buffer = $self->_readline)) { | |
271 if ($buffer =~ /^CC\s{3}(.*)/) { | |
272 $comment .= $1; | |
273 $comment .= "\n"; | |
274 } | |
275 else { | |
276 last; | |
277 } | |
278 } | |
279 my $commobj = Bio::Annotation::Comment->new(); | |
280 # note: don't try to process comments here -- they may contain | |
281 # structure. LP 07/30/2000 | |
282 $commobj->text($comment); | |
283 $annotation->add_Annotation('comment',$commobj); | |
284 $comment = ""; | |
285 # now we are one line ahead -- so continue without reading the next | |
286 # line HL 05/11/2000 | |
287 next; | |
288 } | |
289 #DBLinks | |
290 elsif (/^DR\s+(\S+)\;\s+(\S+)\;\s+(\S+)[\;\.](.*)$/) { | |
291 my $dblinkobj = Bio::Annotation::DBLink->new(); | |
292 $dblinkobj->database($1); | |
293 $dblinkobj->primary_id($2); | |
294 $dblinkobj->optional_id($3); | |
295 my $comment = $4; | |
296 if(length($comment) > 0) { | |
297 # edit comment to get rid of leading space and trailing dot | |
298 if( $comment =~ /^\s*(\S+)\./ ) { | |
299 $dblinkobj->comment($1); | |
300 } else { | |
301 $dblinkobj->comment($comment); | |
302 } | |
303 } | |
304 $annotation->add_Annotation('dblink',$dblinkobj); | |
305 } | |
306 #keywords | |
307 elsif( /^KW\s+(.*)$/ ) { | |
308 my @kw = split(/\s*\;\s*/,$1); | |
309 defined $kw[-1] && $kw[-1] =~ s/\.$//; | |
310 push @{$params{'-keywords'}}, @kw; | |
311 } | |
312 | |
313 | |
314 # Get next line. Getting here assumes that we indeed need to read the | |
315 # line. | |
316 $buffer = $self->_readline; | |
317 } | |
318 | |
319 $buffer = $_; | |
320 | |
321 FEATURE_TABLE : | |
322 # if there is no feature table, or if we've got beyond, exit loop or don't | |
323 # even enter HL 05/11/2000 | |
324 while (defined ($buffer) && ($buffer =~ /^FT/)) { | |
325 my $ftunit = $self->_read_FTHelper_swissprot(\$buffer); | |
326 | |
327 # process ftunit | |
328 # when parsing of the line fails we get undef returned | |
329 if($ftunit) { | |
330 push(@features, | |
331 $ftunit->_generic_seqfeature($self->location_factory(), | |
332 $params{'-seqid'}, "SwissProt")); | |
333 } else { | |
334 $self->warn("failed to parse feature table line for seq " . | |
335 $params{'-display_id'}); | |
336 } | |
337 } | |
338 if( $buffer !~ /^SQ/ ) { | |
339 while( defined($_ = $self->_readline) ) { | |
340 /^SQ/ && last; | |
341 } | |
342 } | |
343 $seqc = ""; | |
344 while( defined ($_ = $self->_readline) ) { | |
345 /^\/\// && last; | |
346 $_ = uc($_); | |
347 s/[^A-Za-z]//g; | |
348 $seqc .= $_; | |
349 } | |
350 | |
351 my $seq= $self->sequence_factory->create | |
352 (-verbose => $self->verbose, | |
353 %params, | |
354 -seq => $seqc, | |
355 -desc => $desc, | |
356 -features => \@features, | |
357 -annotation => $annotation, | |
358 ); | |
359 | |
360 # The annotation doesn't get added by the contructor | |
361 $seq->annotation($annotation); | |
362 | |
363 return $seq; | |
364 } | |
365 | |
366 =head2 write_seq | |
367 | |
368 Title : write_seq | |
369 Usage : $stream->write_seq($seq) | |
370 Function: writes the $seq object (must be seq) to the stream | |
371 Returns : 1 for success and 0 for error | |
372 Args : array of 1 to n Bio::SeqI objects | |
373 | |
374 | |
375 =cut | |
376 | |
377 sub write_seq { | |
378 my ($self,@seqs) = @_; | |
379 foreach my $seq ( @seqs ) { | |
380 $self->throw("Attempting to write with no seq!") unless defined $seq; | |
381 | |
382 if( ! ref $seq || ! $seq->isa('Bio::SeqI') ) { | |
383 $self->warn(" $seq is not a SeqI compliant module. Attempting to dump, but may fail!"); | |
384 } | |
385 | |
386 my $i; | |
387 my $str = $seq->seq; | |
388 | |
389 my $mol; | |
390 my $div; | |
391 my $len = $seq->length(); | |
392 | |
393 if ( !$seq->can('division') || ! defined ($div = $seq->division()) ) { | |
394 $div = 'UNK'; | |
395 } | |
396 | |
397 if( ! $seq->can('alphabet') || ! defined ($mol = $seq->alphabet) ) { | |
398 $mol = 'XXX'; | |
399 } | |
400 | |
401 my $temp_line; | |
402 if( $self->_id_generation_func ) { | |
403 $temp_line = &{$self->_id_generation_func}($seq); | |
404 } else { | |
405 #$temp_line = sprintf ("%10s STANDARD; %3s; %d AA.", | |
406 # $seq->primary_id()."_".$div,$mol,$len); | |
407 # Reconstructing the ID relies heavily upon the input source having | |
408 # been in a format that is parsed as this routine expects it -- that is, | |
409 # by this module itself. This is bad, I think, and immediately breaks | |
410 # if e.g. the Bio::DB::GenPept module is used as input. | |
411 # Hence, switch to display_id(); _every_ sequence is supposed to have | |
412 # this. HL 2000/09/03 | |
413 $mol =~ s/protein/PRT/; | |
414 $temp_line = sprintf ("%10s STANDARD; %3s; %d AA.", | |
415 $seq->display_id(), $mol, $len); | |
416 } | |
417 | |
418 $self->_print( "ID $temp_line\n"); | |
419 | |
420 # if there, write the accession line | |
421 local($^W) = 0; # supressing warnings about uninitialized fields | |
422 | |
423 if( $self->_ac_generation_func ) { | |
424 $temp_line = &{$self->_ac_generation_func}($seq); | |
425 $self->_print( "AC $temp_line\n"); | |
426 } else { | |
427 if ($seq->can('accession_number') ) { | |
428 $self->_print("AC ",$seq->accession_number,";"); | |
429 if ($seq->can('get_secondary_accessions') ) { | |
430 foreach my $sacc ($seq->get_secondary_accessions) { | |
431 $self->_print(" ",$sacc,";"); | |
432 } | |
433 $self->_print("\n"); | |
434 } | |
435 else { | |
436 $self->_print("\n"); | |
437 } | |
438 } | |
439 # otherwise - cannot print <sigh> | |
440 } | |
441 | |
442 # Date lines | |
443 | |
444 if( $seq->can('get_dates') ) { | |
445 foreach my $dt ( $seq->get_dates() ) { | |
446 $self->_write_line_swissprot_regex("DT ","DT ", | |
447 $dt,"\\s\+\|\$",80); | |
448 } | |
449 } | |
450 | |
451 #Definition lines | |
452 $self->_write_line_swissprot_regex("DE ","DE ",$seq->desc(),"\\s\+\|\$",80); | |
453 | |
454 #Gene name | |
455 if ((my @genes = $seq->annotation->get_Annotations('gene_name') ) ) { | |
456 $self->_print("GN ", | |
457 join(' OR ', | |
458 map { | |
459 $_->isa("Bio::Annotation::StructuredValue") ? | |
460 $_->value(-joins => [" AND ", " OR "]) : | |
461 $_->value(); | |
462 } @genes), | |
463 ".\n"); | |
464 } | |
465 | |
466 # Organism lines | |
467 if ($seq->can('species') && (my $spec = $seq->species)) { | |
468 my($species, @class) = $spec->classification(); | |
469 my $genus = $class[0]; | |
470 my $OS = "$genus $species"; | |
471 if ($class[$#class] =~ /viruses/i) { | |
472 # different OS / OC syntax for viruses LP 09/16/2000 | |
473 shift @class; | |
474 } | |
475 if (my $ssp = $spec->sub_species) { | |
476 $OS .= " $ssp"; | |
477 } | |
478 foreach (($spec->variant, $spec->common_name)) { | |
479 $OS .= " ($_)" if $_; | |
480 } | |
481 $self->_print( "OS $OS.\n"); | |
482 my $OC = join('; ', reverse(@class)) .'.'; | |
483 $self->_write_line_swissprot_regex("OC ","OC ",$OC,"\; \|\$",80); | |
484 if ($spec->organelle) { | |
485 $self->_write_line_swissprot_regex("OG ","OG ",$spec->organelle,"\; \|\$",80); | |
486 } | |
487 if ($spec->ncbi_taxid) { | |
488 $self->_print("OX NCBI_TaxID=".$spec->ncbi_taxid.";\n"); | |
489 } | |
490 } | |
491 | |
492 # Reference lines | |
493 my $t = 1; | |
494 foreach my $ref ( $seq->annotation->get_Annotations('reference') ) { | |
495 $self->_print( "RN [$t]\n"); | |
496 # changed by lorenz 08/03/00 | |
497 # j.gilbert and h.lapp agreed that the rp line in swissprot seems | |
498 # more like a comment than a parseable value, so print it as is | |
499 if ($ref->rp) { | |
500 $self->_write_line_swissprot_regex("RP ","RP ",$ref->rp, | |
501 "\\s\+\|\$",80); | |
502 } | |
503 if ($ref->comment) { | |
504 $self->_write_line_swissprot_regex("RC ","RC ",$ref->comment, | |
505 "\\s\+\|\$",80); | |
506 } | |
507 if ($ref->medline) { | |
508 # new RX format in swissprot LP 09/17/00 | |
509 if ($ref->pubmed) { | |
510 $self->_write_line_swissprot_regex("RX ","RX ", | |
511 "MEDLINE=".$ref->medline. | |
512 "; PubMed=".$ref->pubmed.";", | |
513 "\\s\+\|\$",80); | |
514 } else { | |
515 $self->_write_line_swissprot_regex("RX MEDLINE; ","RX MEDLINE; ", | |
516 $ref->medline.".","\\s\+\|\$",80); | |
517 } | |
518 } | |
519 my $author = $ref->authors .';' if($ref->authors); | |
520 my $title = $ref->title .';' if( $ref->title); | |
521 | |
522 $self->_write_line_swissprot_regex("RA ","RA ",$author,"\\s\+\|\$",80); | |
523 $self->_write_line_swissprot_regex("RT ","RT ",$title,"\\s\+\|\$",80); | |
524 $self->_write_line_swissprot_regex("RL ","RL ",$ref->location,"\\s\+\|\$",80); | |
525 $t++; | |
526 } | |
527 | |
528 # Comment lines | |
529 | |
530 foreach my $comment ( $seq->annotation->get_Annotations('comment') ) { | |
531 foreach my $cline (split ("\n", $comment->text)) { | |
532 while (length $cline > 74) { | |
533 $self->_print("CC ",(substr $cline,0,74),"\n"); | |
534 $cline = substr $cline,74; | |
535 } | |
536 $self->_print("CC ",$cline,"\n"); | |
537 } | |
538 } | |
539 | |
540 foreach my $dblink ( $seq->annotation->get_Annotations('dblink') ) | |
541 { | |
542 if (defined($dblink->comment)&&($dblink->comment)) { | |
543 $self->_print("DR ",$dblink->database,"; ",$dblink->primary_id,"; ", | |
544 $dblink->optional_id,"; ",$dblink->comment,".\n"); | |
545 } elsif($dblink->optional_id) { | |
546 $self->_print("DR ",$dblink->database,"; ", | |
547 $dblink->primary_id,"; ", | |
548 $dblink->optional_id,".\n"); | |
549 } | |
550 else { | |
551 $self->_print("DR ",$dblink->database, | |
552 "; ",$dblink->primary_id,"; ","-.\n"); | |
553 } | |
554 } | |
555 | |
556 # if there, write the kw line | |
557 { | |
558 my( $kw ); | |
559 if( my $func = $self->_kw_generation_func ) { | |
560 $kw = &{$func}($seq); | |
561 } elsif( $seq->can('keywords') ) { | |
562 $kw = $seq->keywords; | |
563 if( ref($kw) =~ /ARRAY/i ) { | |
564 $kw = join("; ", @$kw); | |
565 } | |
566 $kw .= '.' if( $kw !~ /\.$/ ); | |
567 } | |
568 $self->_write_line_swissprot_regex("KW ","KW ", | |
569 $kw, "\\s\+\|\$",80); | |
570 } | |
571 | |
572 #Check if there are seqfeatures before printing the FT line | |
573 my @feats = $seq->can('top_SeqFeatures') ? $seq->top_SeqFeatures : (); | |
574 if ($feats[0]) { | |
575 if( defined $self->_post_sort ) { | |
576 | |
577 # we need to read things into an array. Process. Sort them. Print 'em | |
578 | |
579 my $post_sort_func = $self->_post_sort(); | |
580 my @fth; | |
581 | |
582 foreach my $sf ( @feats ) { | |
583 push(@fth,Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq)); | |
584 } | |
585 @fth = sort { &$post_sort_func($a,$b) } @fth; | |
586 | |
587 foreach my $fth ( @fth ) { | |
588 $self->_print_swissprot_FTHelper($fth); | |
589 } | |
590 } else { | |
591 # not post sorted. And so we can print as we get them. | |
592 # lower memory load... | |
593 | |
594 foreach my $sf ( @feats ) { | |
595 my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq); | |
596 foreach my $fth ( @fth ) { | |
597 if( ! $fth->isa('Bio::SeqIO::FTHelper') ) { | |
598 $sf->throw("Cannot process FTHelper... $fth"); | |
599 } | |
600 | |
601 $self->_print_swissprot_FTHelper($fth); | |
602 } | |
603 } | |
604 } | |
605 | |
606 if( $self->_show_dna() == 0 ) { | |
607 return; | |
608 } | |
609 } | |
610 # finished printing features. | |
611 | |
612 # molecular weight | |
613 my $mw = ${Bio::Tools::SeqStats->get_mol_wt($seq->primary_seq)}[0]; | |
614 # checksum | |
615 # was crc32 checksum, changed it to crc64 | |
616 my $crc64 = $self->_crc64(\$str); | |
617 $self->_print( sprintf("SQ SEQUENCE %4d AA; %d MW; %16s CRC64;\n", | |
618 $len,$mw,$crc64)); | |
619 $self->_print( " "); | |
620 my $linepos; | |
621 for ($i = 0; $i < length($str); $i += 10) { | |
622 $self->_print( substr($str,$i,10), " "); | |
623 $linepos += 11; | |
624 if( ($i+10)%60 == 0 && (($i+10) < length($str))) { | |
625 $self->_print( "\n "); | |
626 } | |
627 } | |
628 $self->_print( "\n//\n"); | |
629 | |
630 $self->flush if $self->_flush_on_write && defined $self->_fh; | |
631 return 1; | |
632 } | |
633 } | |
634 | |
635 # Thanks to James Gilbert for the following two. LP 08/01/2000 | |
636 | |
637 =head2 _generateCRCTable | |
638 | |
639 Title : _generateCRCTable | |
640 Usage : | |
641 Function: | |
642 Example : | |
643 Returns : | |
644 Args : | |
645 | |
646 | |
647 =cut | |
648 | |
649 sub _generateCRCTable { | |
650 # 10001000001010010010001110000100 | |
651 # 32 | |
652 my $poly = 0xEDB88320; | |
653 my ($self) = shift; | |
654 | |
655 $self->{'_crcTable'} = []; | |
656 foreach my $i (0..255) { | |
657 my $crc = $i; | |
658 for (my $j=8; $j > 0; $j--) { | |
659 if ($crc & 1) { | |
660 $crc = ($crc >> 1) ^ $poly; | |
661 } | |
662 else { | |
663 $crc >>= 1; | |
664 } | |
665 } | |
666 ${$self->{'_crcTable'}}[$i] = $crc; | |
667 } | |
668 } | |
669 | |
670 | |
671 =head2 _crc32 | |
672 | |
673 Title : _crc32 | |
674 Usage : | |
675 Function: | |
676 Example : | |
677 Returns : | |
678 Args : | |
679 | |
680 | |
681 =cut | |
682 | |
683 sub _crc32 { | |
684 my( $self, $str ) = @_; | |
685 | |
686 $self->throw("Argument to crc32() must be ref to scalar") | |
687 unless ref($str) eq 'SCALAR'; | |
688 | |
689 $self->_generateCRCTable() unless exists $self->{'_crcTable'}; | |
690 | |
691 my $len = length($$str); | |
692 | |
693 my $crc = 0xFFFFFFFF; | |
694 for (my $i = 0; $i < $len; $i++) { | |
695 # Get upper case value of each letter | |
696 my $int = ord uc substr $$str, $i, 1; | |
697 $crc = (($crc >> 8) & 0x00FFFFFF) ^ | |
698 ${$self->{'_crcTable'}}[ ($crc ^ $int) & 0xFF ]; | |
699 } | |
700 return $crc; | |
701 } | |
702 | |
703 =head2 _crc64 | |
704 | |
705 Title : _crc64 | |
706 Usage : | |
707 Function: | |
708 Example : | |
709 Returns : | |
710 Args : | |
711 | |
712 | |
713 =cut | |
714 | |
715 sub _crc64{ | |
716 my ($self, $sequence) = @_; | |
717 my $POLY64REVh = 0xd8000000; | |
718 my @CRCTableh = 256; | |
719 my @CRCTablel = 256; | |
720 my $initialized; | |
721 | |
722 | |
723 my $seq = $$sequence; | |
724 | |
725 my $crcl = 0; | |
726 my $crch = 0; | |
727 if (!$initialized) { | |
728 $initialized = 1; | |
729 for (my $i=0; $i<256; $i++) { | |
730 my $partl = $i; | |
731 my $parth = 0; | |
732 for (my $j=0; $j<8; $j++) { | |
733 my $rflag = $partl & 1; | |
734 $partl >>= 1; | |
735 $partl |= (1 << 31) if $parth & 1; | |
736 $parth >>= 1; | |
737 $parth ^= $POLY64REVh if $rflag; | |
738 } | |
739 $CRCTableh[$i] = $parth; | |
740 $CRCTablel[$i] = $partl; | |
741 } | |
742 } | |
743 | |
744 foreach (split '', $seq) { | |
745 my $shr = ($crch & 0xFF) << 24; | |
746 my $temp1h = $crch >> 8; | |
747 my $temp1l = ($crcl >> 8) | $shr; | |
748 my $tableindex = ($crcl ^ (unpack "C", $_)) & 0xFF; | |
749 $crch = $temp1h ^ $CRCTableh[$tableindex]; | |
750 $crcl = $temp1l ^ $CRCTablel[$tableindex]; | |
751 } | |
752 my $crc64 = sprintf("%08X%08X", $crch, $crcl); | |
753 | |
754 return $crc64; | |
755 | |
756 } | |
757 | |
758 =head2 _print_swissprot_FTHelper | |
759 | |
760 Title : _print_swissprot_FTHelper | |
761 Usage : | |
762 Function: | |
763 Example : | |
764 Returns : | |
765 Args : | |
766 | |
767 | |
768 =cut | |
769 | |
770 sub _print_swissprot_FTHelper { | |
771 my ($self,$fth,$always_quote) = @_; | |
772 $always_quote ||= 0; | |
773 my ($start,$end) = ('?', '?'); | |
774 | |
775 if( ! ref $fth || ! $fth->isa('Bio::SeqIO::FTHelper') ) { | |
776 $fth->warn("$fth is not a FTHelper class. ". | |
777 "Attempting to print, but there could be tears!"); | |
778 } | |
779 | |
780 if( $fth->loc =~ /(\?|\d+|\>\d+|<\d+)?\.\.(\?|\d+|<\d+|>\d+)?/ ) { | |
781 $start = $1 if defined $1; | |
782 $end = $2 if defined $2; | |
783 | |
784 # to_FTString only returns one value when start == end, #JB955 | |
785 # so if no match is found, assume it is both start and end #JB955 | |
786 } else { | |
787 $start = $end = $fth->loc; | |
788 } | |
789 | |
790 my $desc = ""; | |
791 $desc = @{$fth->field->{"description"}}[0]."." | |
792 if exists $fth->field->{"description"}; | |
793 $self->_write_line_swissprot_regex(sprintf("FT %-8s %6s %6s ", | |
794 substr($fth->key,0,8), | |
795 $start,$end), | |
796 "FT ", | |
797 $desc.'.','\s+|$',80); | |
798 } | |
799 #' | |
800 | |
801 =head2 _read_swissprot_References | |
802 | |
803 Title : _read_swissprot_References | |
804 Usage : | |
805 Function: Reads references from swissprot format. Internal function really | |
806 Example : | |
807 Returns : | |
808 Args : | |
809 | |
810 | |
811 =cut | |
812 | |
813 sub _read_swissprot_References{ | |
814 my ($self,$buffer) = @_; | |
815 my (@refs); | |
816 my ($b1, $b2, $rp, $title, $loc, $au, $med, $com, $pubmed); | |
817 | |
818 if ($$buffer !~ /^RP/) { | |
819 $$buffer = $self->_readline; | |
820 } | |
821 if( !defined $$buffer ) { return undef; } | |
822 if( $$buffer =~ /^RP/ ) { | |
823 if ($$buffer =~ /^RP (SEQUENCE OF (\d+)-(\d+).*)/) { | |
824 $rp=$1; | |
825 $b1=$2; | |
826 $b2=$3; | |
827 } | |
828 elsif ($$buffer =~ /^RP (.*)/) { | |
829 $rp=$1; | |
830 } | |
831 | |
832 } | |
833 while( defined ($_ = $self->_readline) ) { | |
834 #/^CC/ && last; | |
835 /^RN/ && last; # separator between references ! LP 07/25/2000 | |
836 #/^SQ/ && last; # there may be sequences without CC lines! HL 05/11/2000 | |
837 /^[^R]/ && last; # may be the safest exit point HL 05/11/2000 | |
838 /^RX MEDLINE;\s+(\d+)/ && do {$med=$1}; | |
839 /^RX MEDLINE=(\d+);\s+PubMed=(\d+);/ && do {$med=$1;$pubmed=$2}; | |
840 /^RA (.*)/ && do { $au .= $au ? " $1" : $1; next;}; | |
841 /^RT (.*)/ && do { $title .= $title ? " $1" : $1; next;}; | |
842 /^RL (.*)/ && do { $loc .= $loc ? " $1" : $1; next;}; | |
843 /^RC (.*)/ && do { $com .= $com ? " $1" : $1; next;}; | |
844 } | |
845 | |
846 my $ref = new Bio::Annotation::Reference; | |
847 $au =~ s/;\s*$//g; | |
848 if( defined $title ) { | |
849 $title =~ s/;\s*$//g; | |
850 } | |
851 | |
852 $ref->start($b1); | |
853 $ref->end($b2); | |
854 $ref->authors($au); | |
855 $ref->title($title); | |
856 $ref->location($loc); | |
857 $ref->medline($med); | |
858 $ref->pubmed($pubmed) if (defined $pubmed); | |
859 $ref->comment($com); | |
860 $ref->rp($rp); | |
861 | |
862 push(@refs,$ref); | |
863 $$buffer = $_; | |
864 return \@refs; | |
865 } | |
866 | |
867 | |
868 =head2 _read_swissprot_Species | |
869 | |
870 Title : _read_swissprot_Species | |
871 Usage : | |
872 Function: Reads the swissprot Organism species and classification | |
873 lines. | |
874 Example : | |
875 Returns : A Bio::Species object | |
876 Args : | |
877 | |
878 =cut | |
879 | |
880 sub _read_swissprot_Species { | |
881 my( $self, $buffer ) = @_; | |
882 my $org; | |
883 | |
884 $_ = $$buffer; | |
885 my( $subspecies, $species, $genus, $common, $variant, $ncbi_taxid ); | |
886 my @class; | |
887 my ($binomial, $descr); | |
888 my $osline = ""; | |
889 | |
890 while (defined( $_ ||= $self->_readline )) { | |
891 last unless /^O[SCGX]/; | |
892 # believe it or not, but OS may come multiple times -- at this time | |
893 # we can't capture multiple species | |
894 if(/^OS\s+(\S.+)/ && (! defined($binomial))) { | |
895 $osline .= " " if $osline; | |
896 $osline .= $1; | |
897 if($osline =~ s/(,|, and|\.)$//) { | |
898 ($binomial, $descr) = $osline =~ /(\S[^\(]+)(.*)/; | |
899 ($genus, $species, $subspecies) = split(/\s+/, $binomial); | |
900 $species = "sp." unless $species; | |
901 while($descr =~ /\(([^\)]+)\)/g) { | |
902 my $item = $1; | |
903 # strain etc may not necessarily come first (yes, swissprot | |
904 # is messy) | |
905 if((! defined($variant)) && | |
906 (($item =~ /(^|[^\(\w])([Ss]train|isolate|serogroup|serotype|subtype|clone)\b/) || | |
907 ($item =~ /^(biovar|pv\.|type\s+)/))) { | |
908 $variant = $item; | |
909 } elsif($item =~ s/^subsp\.\s+//) { | |
910 if(! $subspecies) { | |
911 $subspecies = $item; | |
912 } elsif(! $variant) { | |
913 $variant = $item; | |
914 } | |
915 } elsif(! defined($common)) { | |
916 # we're only interested in the first common name | |
917 $common = $item; | |
918 if((index($common, '(') >= 0) && | |
919 (index($common, ')') < 0)) { | |
920 $common .= ')'; | |
921 } | |
922 } | |
923 } | |
924 } | |
925 } | |
926 elsif (s/^OC\s+//) { | |
927 push(@class, split /[\;\.]\s*/); | |
928 if($class[0] =~ /viruses/i) { | |
929 # viruses have different OS/OC syntax | |
930 my @virusnames = split(/\s+/, $binomial); | |
931 $species = (@virusnames > 1) ? pop(@virusnames) : ''; | |
932 $genus = join(" ", @virusnames); | |
933 $subspecies = undef; | |
934 } | |
935 } | |
936 elsif (/^OG\s+(.*)/) { | |
937 $org = $1; | |
938 } | |
939 elsif (/^OX\s+(.*)/ && (! defined($ncbi_taxid))) { | |
940 my $taxstring = $1; | |
941 # we only keep the first one and ignore all others | |
942 if ($taxstring =~ /NCBI_TaxID=([\w\d]+)/) { | |
943 $ncbi_taxid = $1; | |
944 } else { | |
945 $self->throw("$taxstring doesn't look like NCBI_TaxID"); | |
946 } | |
947 } | |
948 | |
949 $_ = undef; # Empty $_ to trigger read of next line | |
950 } | |
951 | |
952 $$buffer = $_; | |
953 | |
954 # Don't make a species object if it is "Unknown" or "None" | |
955 return if $genus =~ /^(Unknown|None)$/i; | |
956 | |
957 if ($class[$#class] eq $genus) { | |
958 push( @class, $species ); | |
959 } else { | |
960 push( @class, $genus, $species ); | |
961 } | |
962 | |
963 @class = reverse @class; | |
964 | |
965 my $taxon = Bio::Species->new(); | |
966 $taxon->classification( \@class, "FORCE" ); # no name validation please | |
967 $taxon->common_name( $common ) if $common; | |
968 $taxon->sub_species( $subspecies ) if $subspecies; | |
969 $taxon->organelle ( $org ) if $org; | |
970 $taxon->ncbi_taxid ( $ncbi_taxid ) if $ncbi_taxid; | |
971 $taxon->variant($variant) if $variant; | |
972 | |
973 # done | |
974 return $taxon; | |
975 } | |
976 | |
977 =head2 _filehandle | |
978 | |
979 Title : _filehandle | |
980 Usage : $obj->_filehandle($newval) | |
981 Function: | |
982 Example : | |
983 Returns : value of _filehandle | |
984 Args : newvalue (optional) | |
985 | |
986 | |
987 =cut | |
988 | |
989 # inherited from SeqIO.pm ! HL 05/11/2000 | |
990 | |
991 =head2 _read_FTHelper_swissprot | |
992 | |
993 Title : _read_FTHelper_swissprot | |
994 Usage : _read_FTHelper_swissprot(\$buffer) | |
995 Function: reads the next FT key line | |
996 Example : | |
997 Returns : Bio::SeqIO::FTHelper object | |
998 Args : filehandle and reference to a scalar | |
999 | |
1000 | |
1001 =cut | |
1002 | |
1003 sub _read_FTHelper_swissprot { | |
1004 # initial version implemented by HL 05/10/2000 | |
1005 # FIXME this may not be perfect, so please review | |
1006 my ($self,$buffer) = @_; | |
1007 my ($key, # The key of the feature | |
1008 $loc, # The location line from the feature | |
1009 $desc, # The descriptive text | |
1010 ); | |
1011 | |
1012 if ($$buffer =~ /^FT (\w+)\s+([\d\?\<]+)\s+([\d\?\>]+)\s*(.*)$/) { | |
1013 $key = $1; | |
1014 my $loc1 = $2; | |
1015 my $loc2 = $3; | |
1016 $loc = "$loc1..$loc2"; | |
1017 if($4 && (length($4) > 0)) { | |
1018 $desc = $4; | |
1019 chomp($desc); | |
1020 } else { | |
1021 $desc = ""; | |
1022 } | |
1023 # Read all the continuation lines up to the next feature | |
1024 while (defined($_ = $self->_readline) && /^FT\s{20,}(\S.*)$/) { | |
1025 $desc .= $1; | |
1026 chomp($desc); | |
1027 } | |
1028 $desc =~ s/\.$//; | |
1029 } else { | |
1030 # No feature key. What's this? | |
1031 $self->warn("No feature key in putative feature table line: $_"); | |
1032 return; | |
1033 } | |
1034 | |
1035 # Put the first line of the next feature into the buffer | |
1036 $$buffer = $_; | |
1037 | |
1038 # Make the new FTHelper object | |
1039 my $out = new Bio::SeqIO::FTHelper(-verbose => $self->verbose()); | |
1040 $out->key($key); | |
1041 $out->loc($loc); | |
1042 | |
1043 # store the description if there is one | |
1044 if($desc && (length($desc) > 0)) { | |
1045 $out->field->{"description"} ||= []; | |
1046 push(@{$out->field->{"description"}}, $desc); | |
1047 } | |
1048 return $out; | |
1049 } | |
1050 | |
1051 | |
1052 =head2 _write_line_swissprot | |
1053 | |
1054 Title : _write_line_swissprot | |
1055 Usage : | |
1056 Function: internal function | |
1057 Example : | |
1058 Returns : | |
1059 Args : | |
1060 | |
1061 | |
1062 =cut | |
1063 | |
1064 sub _write_line_swissprot{ | |
1065 my ($self,$pre1,$pre2,$line,$length) = @_; | |
1066 | |
1067 $length || die "Miscalled write_line_swissprot without length. Programming error!"; | |
1068 my $subl = $length - length $pre2; | |
1069 my $linel = length $line; | |
1070 my $i; | |
1071 | |
1072 my $sub = substr($line,0,$length - length $pre1); | |
1073 | |
1074 $self->_print( "$pre1$sub\n"); | |
1075 | |
1076 for($i= ($length - length $pre1);$i < $linel;) { | |
1077 $sub = substr($line,$i,($subl)); | |
1078 $self->_print( "$pre2$sub\n"); | |
1079 $i += $subl; | |
1080 } | |
1081 | |
1082 } | |
1083 | |
1084 =head2 _write_line_swissprot_regex | |
1085 | |
1086 Title : _write_line_swissprot_regex | |
1087 Usage : | |
1088 Function: internal function for writing lines of specified | |
1089 length, with different first and the next line | |
1090 left hand headers and split at specific points in the | |
1091 text | |
1092 Example : | |
1093 Returns : nothing | |
1094 Args : file handle, first header, second header, text-line, regex for line breaks, total line length | |
1095 | |
1096 | |
1097 =cut | |
1098 | |
1099 sub _write_line_swissprot_regex { | |
1100 my ($self,$pre1,$pre2,$line,$regex,$length) = @_; | |
1101 | |
1102 #print STDOUT "Going to print with $line!\n"; | |
1103 | |
1104 $length || die "Miscalled write_line_swissprot without length. Programming error!"; | |
1105 | |
1106 if( length $pre1 != length $pre2 ) { | |
1107 print STDERR "len 1 is ", length $pre1, " len 2 is ", length $pre2, "\n"; | |
1108 die "Programming error - cannot called write_line_swissprot_regex with different length \npre1 ($pre1) and \npre2 ($pre2) tags!"; | |
1109 } | |
1110 | |
1111 my $subl = $length - (length $pre1) -1 ; | |
1112 my @lines; | |
1113 | |
1114 while($line =~ m/(.{1,$subl})($regex)/g) { | |
1115 push(@lines, $1.$2); | |
1116 } | |
1117 | |
1118 my $s = shift @lines; | |
1119 $self->_print( "$pre1$s\n"); | |
1120 foreach my $s ( @lines ) { | |
1121 $self->_print( "$pre2$s\n"); | |
1122 } | |
1123 } | |
1124 | |
1125 =head2 _post_sort | |
1126 | |
1127 Title : _post_sort | |
1128 Usage : $obj->_post_sort($newval) | |
1129 Function: | |
1130 Returns : value of _post_sort | |
1131 Args : newvalue (optional) | |
1132 | |
1133 | |
1134 =cut | |
1135 | |
1136 sub _post_sort{ | |
1137 my $obj = shift; | |
1138 if( @_ ) { | |
1139 my $value = shift; | |
1140 $obj->{'_post_sort'} = $value; | |
1141 } | |
1142 return $obj->{'_post_sort'}; | |
1143 | |
1144 } | |
1145 | |
1146 =head2 _show_dna | |
1147 | |
1148 Title : _show_dna | |
1149 Usage : $obj->_show_dna($newval) | |
1150 Function: | |
1151 Returns : value of _show_dna | |
1152 Args : newvalue (optional) | |
1153 | |
1154 | |
1155 =cut | |
1156 | |
1157 sub _show_dna{ | |
1158 my $obj = shift; | |
1159 if( @_ ) { | |
1160 my $value = shift; | |
1161 $obj->{'_show_dna'} = $value; | |
1162 } | |
1163 return $obj->{'_show_dna'}; | |
1164 | |
1165 } | |
1166 | |
1167 =head2 _id_generation_func | |
1168 | |
1169 Title : _id_generation_func | |
1170 Usage : $obj->_id_generation_func($newval) | |
1171 Function: | |
1172 Returns : value of _id_generation_func | |
1173 Args : newvalue (optional) | |
1174 | |
1175 | |
1176 =cut | |
1177 | |
1178 sub _id_generation_func{ | |
1179 my $obj = shift; | |
1180 if( @_ ) { | |
1181 my $value = shift; | |
1182 $obj->{'_id_generation_func'} = $value; | |
1183 } | |
1184 return $obj->{'_id_generation_func'}; | |
1185 | |
1186 } | |
1187 | |
1188 =head2 _ac_generation_func | |
1189 | |
1190 Title : _ac_generation_func | |
1191 Usage : $obj->_ac_generation_func($newval) | |
1192 Function: | |
1193 Returns : value of _ac_generation_func | |
1194 Args : newvalue (optional) | |
1195 | |
1196 | |
1197 =cut | |
1198 | |
1199 sub _ac_generation_func{ | |
1200 my $obj = shift; | |
1201 if( @_ ) { | |
1202 my $value = shift; | |
1203 $obj->{'_ac_generation_func'} = $value; | |
1204 } | |
1205 return $obj->{'_ac_generation_func'}; | |
1206 | |
1207 } | |
1208 | |
1209 =head2 _sv_generation_func | |
1210 | |
1211 Title : _sv_generation_func | |
1212 Usage : $obj->_sv_generation_func($newval) | |
1213 Function: | |
1214 Returns : value of _sv_generation_func | |
1215 Args : newvalue (optional) | |
1216 | |
1217 | |
1218 =cut | |
1219 | |
1220 sub _sv_generation_func{ | |
1221 my $obj = shift; | |
1222 if( @_ ) { | |
1223 my $value = shift; | |
1224 $obj->{'_sv_generation_func'} = $value; | |
1225 } | |
1226 return $obj->{'_sv_generation_func'}; | |
1227 | |
1228 } | |
1229 | |
1230 =head2 _kw_generation_func | |
1231 | |
1232 Title : _kw_generation_func | |
1233 Usage : $obj->_kw_generation_func($newval) | |
1234 Function: | |
1235 Returns : value of _kw_generation_func | |
1236 Args : newvalue (optional) | |
1237 | |
1238 | |
1239 =cut | |
1240 | |
1241 sub _kw_generation_func{ | |
1242 my $obj = shift; | |
1243 if( @_ ) { | |
1244 my $value = shift; | |
1245 $obj->{'_kw_generation_func'} = $value; | |
1246 } | |
1247 return $obj->{'_kw_generation_func'}; | |
1248 | |
1249 } | |
1250 | |
1251 1; |