Mercurial > repos > mahtabm > ensemb_rep_gvl
comparison variant_effect_predictor/Bio/SeqIO/embl.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:2bc9b66ada89 |
---|---|
1 # $Id: embl.pm,v 1.57.2.6 2003/09/14 19:06:51 jason Exp $ | |
2 # | |
3 # BioPerl module for Bio::SeqIO::EMBL | |
4 # | |
5 # Cared for by Ewan Birney <birney@ebi.ac.uk> | |
6 # | |
7 # Copyright Ewan Birney | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 | |
11 # POD documentation - main docs before the code | |
12 | |
13 =head1 NAME | |
14 | |
15 Bio::SeqIO::embl - EMBL sequence input/output stream | |
16 | |
17 =head1 SYNOPSIS | |
18 | |
19 It is probably best not to use this object directly, but | |
20 rather go through the AnnSeqIO handler system. Go: | |
21 | |
22 $stream = Bio::SeqIO->new(-file => $filename, -format => 'EMBL'); | |
23 | |
24 while ( (my $seq = $stream->next_seq()) ) { | |
25 # do something with $seq | |
26 } | |
27 | |
28 =head1 DESCRIPTION | |
29 | |
30 This object can transform Bio::Seq objects to and from EMBL flat | |
31 file databases. | |
32 | |
33 There is alot of flexibility here about how to dump things which I need | |
34 to document fully. | |
35 | |
36 There should be a common object that this and genbank share (probably | |
37 with swissprot). Too much of the magic is identical. | |
38 | |
39 =head2 Optional functions | |
40 | |
41 =over 3 | |
42 | |
43 =item _show_dna() | |
44 | |
45 (output only) shows the dna or not | |
46 | |
47 =item _post_sort() | |
48 | |
49 (output only) provides a sorting func which is applied to the FTHelpers | |
50 before printing | |
51 | |
52 =item _id_generation_func() | |
53 | |
54 This is function which is called as | |
55 | |
56 print "ID ", $func($annseq), "\n"; | |
57 | |
58 To generate the ID line. If it is not there, it generates a sensible ID | |
59 line using a number of tools. | |
60 | |
61 If you want to output annotations in embl format they need to be | |
62 stored in a Bio::Annotation::Collection object which is accessible | |
63 through the Bio::SeqI interface method L<annotation()|annotation>. | |
64 | |
65 The following are the names of the keys which are polled from a | |
66 L<Bio::Annotation::Collection> object. | |
67 | |
68 reference - Should contain Bio::Annotation::Reference objects | |
69 comment - Should contain Bio::Annotation::Comment objects | |
70 dblink - Should contain Bio::Annotation::DBLink objects | |
71 | |
72 =back | |
73 | |
74 =head1 FEEDBACK | |
75 | |
76 =head2 Mailing Lists | |
77 | |
78 User feedback is an integral part of the evolution of this | |
79 and other Bioperl modules. Send your comments and suggestions preferably | |
80 to one of the Bioperl mailing lists. | |
81 Your participation is much appreciated. | |
82 | |
83 bioperl-l@bioperl.org - General discussion | |
84 http://www.bioperl.org/MailList.shtml - About the mailing lists | |
85 | |
86 =head2 Reporting Bugs | |
87 | |
88 Report bugs to the Bioperl bug tracking system to help us keep track | |
89 the bugs and their resolution. | |
90 Bug reports can be submitted via email or the web: | |
91 | |
92 bioperl-bugs@bio.perl.org | |
93 http://bugzilla.bioperl.org/ | |
94 | |
95 =head1 AUTHOR - Ewan Birney | |
96 | |
97 Email birney@ebi.ac.uk | |
98 | |
99 Describe contact details here | |
100 | |
101 =head1 APPENDIX | |
102 | |
103 The rest of the documentation details each of the object | |
104 methods. Internal methods are usually preceded with a _ | |
105 | |
106 =cut | |
107 | |
108 | |
109 # Let the code begin... | |
110 | |
111 | |
112 package Bio::SeqIO::embl; | |
113 use vars qw(@ISA); | |
114 use strict; | |
115 use Bio::SeqIO::FTHelper; | |
116 use Bio::SeqFeature::Generic; | |
117 use Bio::Species; | |
118 use Bio::Seq::SeqFactory; | |
119 use Bio::Annotation::Collection; | |
120 use Bio::Annotation::Comment; | |
121 use Bio::Annotation::Reference; | |
122 use Bio::Annotation::DBLink; | |
123 | |
124 @ISA = qw(Bio::SeqIO); | |
125 | |
126 sub _initialize { | |
127 my($self,@args) = @_; | |
128 | |
129 $self->SUPER::_initialize(@args); | |
130 # hash for functions for decoding keys. | |
131 $self->{'_func_ftunit_hash'} = {}; | |
132 $self->_show_dna(1); # sets this to one by default. People can change it | |
133 if( ! defined $self->sequence_factory ) { | |
134 $self->sequence_factory(new Bio::Seq::SeqFactory | |
135 (-verbose => $self->verbose(), | |
136 -type => 'Bio::Seq::RichSeq')); | |
137 } | |
138 } | |
139 | |
140 =head2 next_seq | |
141 | |
142 Title : next_seq | |
143 Usage : $seq = $stream->next_seq() | |
144 Function: returns the next sequence in the stream | |
145 Returns : Bio::Seq object | |
146 Args : | |
147 | |
148 =cut | |
149 | |
150 sub next_seq { | |
151 my ($self,@args) = @_; | |
152 my ($pseq,$c,$line,$name,$desc,$acc,$seqc,$mol,$div, | |
153 $date, $comment, @date_arr); | |
154 | |
155 my ($annotation, %params, @features) = ( new Bio::Annotation::Collection); | |
156 | |
157 $line = $self->_readline; # This needs to be before the first eof() test | |
158 | |
159 if( !defined $line ) { | |
160 return undef; # no throws - end of file | |
161 } | |
162 | |
163 if( $line =~ /^\s+$/ ) { | |
164 while( defined ($line = $self->_readline) ) { | |
165 $line =~/^\S/ && last; | |
166 } | |
167 } | |
168 if( !defined $line ) { | |
169 return undef; # end of file | |
170 } | |
171 $line =~ /^ID\s+\S+/ || $self->throw("EMBL stream with no ID. Not embl in my book"); | |
172 $line =~ /^ID\s+(\S+)\s+\S+\;\s+([^;]+)\;\s+(\S+)\;/; | |
173 $name = $1; | |
174 $mol = $2; | |
175 $div = $3; | |
176 if(! $name) { | |
177 $name = "unknown id"; | |
178 } | |
179 my $alphabet; | |
180 | |
181 # this is important to have the id for display in e.g. FTHelper, otherwise | |
182 # you won't know which entry caused an error | |
183 if($mol) { | |
184 if ( $mol =~ /circular/ ) { | |
185 $params{'-is_circular'} = 1; | |
186 $mol =~ s|circular ||; | |
187 } | |
188 if (defined $mol ) { | |
189 if ($mol =~ /DNA/) { | |
190 $alphabet='dna'; | |
191 } | |
192 elsif ($mol =~ /RNA/) { | |
193 $alphabet='rna'; | |
194 } | |
195 elsif ($mol =~ /AA/) { | |
196 $alphabet='protein'; | |
197 } | |
198 } | |
199 | |
200 } | |
201 | |
202 # $self->warn("not parsing upper annotation in EMBL file yet!"); | |
203 my $buffer = $line; | |
204 | |
205 local $_; | |
206 | |
207 BEFORE_FEATURE_TABLE : | |
208 until( !defined $buffer ) { | |
209 $_ = $buffer; | |
210 | |
211 # Exit at start of Feature table | |
212 last if /^F[HT]/; | |
213 | |
214 # Description line(s) | |
215 if (/^DE\s+(\S.*\S)/) { | |
216 $desc .= $desc ? " $1" : $1; | |
217 } | |
218 | |
219 #accession number | |
220 if( /^AC\s+(.*)?/ ) { | |
221 my @accs = split(/[; ]+/, $1); # allow space in addition | |
222 $params{'-accession_number'} = shift @accs | |
223 unless defined $params{'-accession_number'}; | |
224 push @{$params{'-secondary_accessions'}}, @accs; | |
225 } | |
226 | |
227 #version number | |
228 if( /^SV\s+\S+\.(\d+);?/ ) { | |
229 my $sv = $1; | |
230 #$sv =~ s/\;//; | |
231 $params{'-seq_version'} = $sv; | |
232 $params{'-version'} = $sv; | |
233 } | |
234 | |
235 #date (NOTE: takes last date line) | |
236 if( /^DT\s+(.+)$/ ) { | |
237 my $date = $1; | |
238 push @{$params{'-dates'}}, $date; | |
239 } | |
240 | |
241 #keywords | |
242 if( /^KW\s+(.*)\S*$/ ) { | |
243 my @kw = split(/\s*\;\s*/,$1); | |
244 push @{$params{'-keywords'}}, @kw; | |
245 } | |
246 | |
247 # Organism name and phylogenetic information | |
248 elsif (/^O[SC]/) { | |
249 my $species = $self->_read_EMBL_Species(\$buffer); | |
250 $params{'-species'}= $species; | |
251 } | |
252 | |
253 # References | |
254 elsif (/^R/) { | |
255 my @refs = $self->_read_EMBL_References(\$buffer); | |
256 foreach my $ref ( @refs ) { | |
257 $annotation->add_Annotation('reference',$ref); | |
258 } | |
259 } | |
260 | |
261 # DB Xrefs | |
262 elsif (/^DR/) { | |
263 my @links = $self->_read_EMBL_DBLink(\$buffer); | |
264 foreach my $dblink ( @links ) { | |
265 $annotation->add_Annotation('dblink',$dblink); | |
266 } | |
267 } | |
268 | |
269 # Comments | |
270 elsif (/^CC\s+(.*)/) { | |
271 $comment .= $1; | |
272 $comment .= " "; | |
273 while (defined ($_ = $self->_readline) ) { | |
274 if (/^CC\s+(.*)/) { | |
275 $comment .= $1; | |
276 $comment .= " "; | |
277 } | |
278 else { | |
279 last; | |
280 } | |
281 } | |
282 my $commobj = Bio::Annotation::Comment->new(); | |
283 $commobj->text($comment); | |
284 $annotation->add_Annotation('comment',$commobj); | |
285 $comment = ""; | |
286 } | |
287 | |
288 # Get next line. | |
289 $buffer = $self->_readline; | |
290 } | |
291 | |
292 while( defined ($_ = $self->_readline) ) { | |
293 /^FT \w/ && last; | |
294 /^SQ / && last; | |
295 /^CO / && last; | |
296 } | |
297 $buffer = $_; | |
298 | |
299 if (defined($buffer) && $buffer =~ /^FT /) { | |
300 until( !defined ($buffer) ) { | |
301 my $ftunit = $self->_read_FTHelper_EMBL(\$buffer); | |
302 # process ftunit | |
303 | |
304 push(@features, | |
305 $ftunit->_generic_seqfeature($self->location_factory(), $name)); | |
306 | |
307 if( $buffer !~ /^FT/ ) { | |
308 last; | |
309 } | |
310 } | |
311 } | |
312 | |
313 | |
314 # skip comments | |
315 while( defined ($buffer) && $buffer =~ /^XX/ ) { | |
316 $buffer = $self->_readline(); | |
317 } | |
318 | |
319 if( $buffer =~ /^CO/ ) { | |
320 until( !defined ($buffer) ) { | |
321 my $ftunit = $self->_read_FTHelper_EMBL(\$buffer); | |
322 # process ftunit | |
323 push(@features, | |
324 $ftunit->_generic_seqfeature($self->location_factory(), | |
325 $name)); | |
326 | |
327 if( $buffer !~ /^CO/ ) { | |
328 last; | |
329 } | |
330 } | |
331 } | |
332 if( $buffer !~ /^SQ/ ) { | |
333 while( defined ($_ = $self->_readline) ) { | |
334 /^SQ/ && last; | |
335 } | |
336 } | |
337 | |
338 $seqc = ""; | |
339 while( defined ($_ = $self->_readline) ) { | |
340 /^\/\// && last; | |
341 $_ = uc($_); | |
342 s/[^A-Za-z]//g; | |
343 $seqc .= $_; | |
344 } | |
345 my $seq = $self->sequence_factory->create | |
346 (-verbose => $self->verbose(), | |
347 -division => $div, | |
348 -seq => $seqc, | |
349 -desc => $desc, | |
350 -display_id => $name, | |
351 -annotation => $annotation, | |
352 -molecule => $mol, | |
353 -alphabet => $alphabet, | |
354 -features => \@features, | |
355 %params); | |
356 | |
357 return $seq; | |
358 } | |
359 | |
360 =head2 write_seq | |
361 | |
362 Title : write_seq | |
363 Usage : $stream->write_seq($seq) | |
364 Function: writes the $seq object (must be seq) to the stream | |
365 Returns : 1 for success and 0 for error | |
366 Args : array of 1 to n Bio::SeqI objects | |
367 | |
368 | |
369 =cut | |
370 | |
371 sub write_seq { | |
372 my ($self,@seqs) = @_; | |
373 | |
374 foreach my $seq ( @seqs ) { | |
375 $self->throw("Attempting to write with no seq!") unless defined $seq; | |
376 unless ( ref $seq && $seq->isa('Bio::SeqI' ) ) { | |
377 $self->warn("$seq is not a SeqI compliant sequence object!") | |
378 if $self->verbose >= 0; | |
379 unless ( ref $seq && $seq->isa('Bio::PrimarySeqI' ) ) { | |
380 $self->throw("$seq is not a PrimarySeqI compliant sequence object!"); | |
381 } | |
382 } | |
383 my $str = $seq->seq || ''; | |
384 | |
385 my $mol; | |
386 my $div; | |
387 my $len = $seq->length(); | |
388 | |
389 if ($seq->can('division') && defined $seq->division) { | |
390 $div = $seq->division(); | |
391 } | |
392 $div ||= 'UNK'; | |
393 | |
394 if ($seq->can('molecule')) { | |
395 $mol = $seq->molecule(); | |
396 $mol = 'RNA' if defined $mol && $mol =~ /RNA/; # no 'mRNA' | |
397 } | |
398 elsif ($seq->can('primary_seq') && defined $seq->primary_seq->alphabet) { | |
399 my $alphabet =$seq->primary_seq->alphabet; | |
400 if ($alphabet eq 'dna') { | |
401 $mol ='DNA'; | |
402 } | |
403 elsif ($alphabet eq 'rna') { | |
404 $mol='RNA'; | |
405 } | |
406 elsif ($alphabet eq 'protein') { | |
407 $mol='AA'; | |
408 } | |
409 } | |
410 $mol ||= 'XXX'; | |
411 $mol = "circular $mol" if $seq->is_circular; | |
412 | |
413 my $temp_line; | |
414 if( $self->_id_generation_func ) { | |
415 $temp_line = &{$self->_id_generation_func}($seq); | |
416 } else { | |
417 $temp_line = sprintf("%-11sstandard; $mol; $div; %d BP.", $seq->id(), $len); | |
418 } | |
419 | |
420 $self->_print( "ID $temp_line\n","XX\n"); | |
421 | |
422 # Write the accession line if present | |
423 my( $acc ); | |
424 { | |
425 if( my $func = $self->_ac_generation_func ) { | |
426 $acc = &{$func}($seq); | |
427 } elsif( $seq->isa('Bio::Seq::RichSeqI') && | |
428 defined($seq->accession_number) ) { | |
429 $acc = $seq->accession_number; | |
430 $acc = join(";", $acc, $seq->get_secondary_accessions); | |
431 } elsif ( $seq->can('accession_number') ) { | |
432 $acc = $seq->accession_number; | |
433 } | |
434 | |
435 if (defined $acc) { | |
436 $self->_print("AC $acc;\n", | |
437 "XX\n"); | |
438 } | |
439 } | |
440 | |
441 # Write the sv line if present | |
442 { | |
443 my( $sv ); | |
444 if (my $func = $self->_sv_generation_func) { | |
445 $sv = &{$func}($seq); | |
446 } elsif($seq->isa('Bio::Seq::RichSeqI') && | |
447 defined($seq->seq_version)) { | |
448 $sv = "$acc.". $seq->seq_version(); | |
449 } | |
450 if (defined $sv) { | |
451 $self->_print( "SV $sv\n", | |
452 "XX\n"); | |
453 } | |
454 } | |
455 | |
456 # Date lines | |
457 my $switch=0; | |
458 if( $seq->can('get_dates') ) { | |
459 foreach my $dt ( $seq->get_dates() ) { | |
460 $self->_write_line_EMBL_regex("DT ","DT ",$dt,'\s+|$',80);#' | |
461 $switch=1; | |
462 } | |
463 if ($switch == 1) { | |
464 $self->_print("XX\n"); | |
465 } | |
466 } | |
467 | |
468 # Description lines | |
469 $self->_write_line_EMBL_regex("DE ","DE ",$seq->desc(),'\s+|$',80); #' | |
470 $self->_print( "XX\n"); | |
471 | |
472 # if there, write the kw line | |
473 { | |
474 my( $kw ); | |
475 if( my $func = $self->_kw_generation_func ) { | |
476 $kw = &{$func}($seq); | |
477 } elsif( $seq->can('keywords') ) { | |
478 $kw = $seq->keywords; | |
479 if( ref($kw) =~ /ARRAY/i ) { | |
480 $kw = join("; ", @$kw); | |
481 } | |
482 $kw .= '.' if( defined $kw && $kw !~ /\.$/ ); | |
483 } | |
484 if (defined $kw) { | |
485 $self->_write_line_EMBL_regex("KW ", "KW ", | |
486 $kw, '\s+|$', 80); #' | |
487 $self->_print( "XX\n"); | |
488 | |
489 } | |
490 } | |
491 | |
492 # Organism lines | |
493 | |
494 if ($seq->can('species') && (my $spec = $seq->species)) { | |
495 my($species, @class) = $spec->classification(); | |
496 my $genus = $class[0]; | |
497 my $OS = "$genus $species"; | |
498 if (my $ssp = $spec->sub_species) { | |
499 $OS .= " $ssp"; | |
500 } | |
501 if (my $common = $spec->common_name) { | |
502 $OS .= " ($common)"; | |
503 } | |
504 $self->_print("OS $OS\n"); | |
505 my $OC = join('; ', reverse(@class)) .'.'; | |
506 $self->_write_line_EMBL_regex("OC ","OC ",$OC,'; |$',80); #' | |
507 if ($spec->organelle) { | |
508 $self->_write_line_EMBL_regex("OG ","OG ",$spec->organelle,'; |$',80); #' | |
509 } | |
510 $self->_print("XX\n"); | |
511 } | |
512 | |
513 # Reference lines | |
514 my $t = 1; | |
515 if ( $seq->can('annotation') && defined $seq->annotation ) { | |
516 foreach my $ref ( $seq->annotation->get_Annotations('reference') ) { | |
517 $self->_print( "RN [$t]\n"); | |
518 | |
519 # Having no RP line is legal, but we need both | |
520 # start and end for a valid location. | |
521 my $start = $ref->start; | |
522 my $end = $ref->end; | |
523 if ($start and $end) { | |
524 $self->_print( "RP $start-$end\n"); | |
525 } elsif ($start or $end) { | |
526 $self->throw("Both start and end are needed for a valid RP line. Got: start='$start' end='$end'"); | |
527 } | |
528 | |
529 if (my $med = $ref->medline) { | |
530 $self->_print( "RX MEDLINE; $med.\n"); | |
531 } | |
532 if (my $pm = $ref->pubmed) { | |
533 $self->_print( "RX PUBMED; $pm.\n"); | |
534 } | |
535 $self->_write_line_EMBL_regex("RA ", "RA ", | |
536 $ref->authors . ";", | |
537 '\s+|$', 80); #' | |
538 | |
539 # If there is no title to the reference, it appears | |
540 # as a single semi-colon. All titles must end in | |
541 # a semi-colon. | |
542 my $ref_title = $ref->title || ''; | |
543 $ref_title =~ s/[\s;]*$/;/; | |
544 $self->_write_line_EMBL_regex("RT ", "RT ", $ref_title, '\s+|$', 80); #' | |
545 $self->_write_line_EMBL_regex("RL ", "RL ", $ref->location, '\s+|$', 80); #' | |
546 if ($ref->comment) { | |
547 $self->_write_line_EMBL_regex("RC ", "RC ", $ref->comment, '\s+|$', 80); #' | |
548 } | |
549 $self->_print("XX\n"); | |
550 $t++; | |
551 } | |
552 | |
553 | |
554 # DB Xref lines | |
555 if (my @db_xref = $seq->annotation->get_Annotations('dblink') ) { | |
556 foreach my $dr (@db_xref) { | |
557 my $db_name = $dr->database; | |
558 my $prim = $dr->primary_id; | |
559 my $opt = $dr->optional_id || ''; | |
560 | |
561 my $line = "$db_name; $prim; $opt."; | |
562 $self->_write_line_EMBL_regex("DR ", "DR ", $line, '\s+|$', 80); #' | |
563 } | |
564 $self->_print("XX\n"); | |
565 } | |
566 | |
567 # Comment lines | |
568 foreach my $comment ( $seq->annotation->get_Annotations('comment') ) { | |
569 $self->_write_line_EMBL_regex("CC ", "CC ", $comment->text, '\s+|$', 80); #' | |
570 $self->_print("XX\n"); | |
571 } | |
572 } | |
573 # "\\s\+\|\$" | |
574 | |
575 ## FEATURE TABLE | |
576 | |
577 $self->_print("FH Key Location/Qualifiers\n"); | |
578 $self->_print("FH\n"); | |
579 | |
580 my @feats = $seq->can('top_SeqFeatures') ? $seq->top_SeqFeatures : (); | |
581 if ($feats[0]) { | |
582 if( defined $self->_post_sort ) { | |
583 # we need to read things into an array. | |
584 # Process. Sort them. Print 'em | |
585 | |
586 my $post_sort_func = $self->_post_sort(); | |
587 my @fth; | |
588 | |
589 foreach my $sf ( @feats ) { | |
590 push(@fth,Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq)); | |
591 } | |
592 | |
593 @fth = sort { &$post_sort_func($a,$b) } @fth; | |
594 | |
595 foreach my $fth ( @fth ) { | |
596 $self->_print_EMBL_FTHelper($fth); | |
597 } | |
598 } else { | |
599 # not post sorted. And so we can print as we get them. | |
600 # lower memory load... | |
601 | |
602 foreach my $sf ( @feats ) { | |
603 my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq); | |
604 foreach my $fth ( @fth ) { | |
605 if( $fth->key eq 'CONTIG') { | |
606 $self->_show_dna(0); | |
607 } | |
608 $self->_print_EMBL_FTHelper($fth); | |
609 } | |
610 } | |
611 } | |
612 } | |
613 | |
614 if( $self->_show_dna() == 0 ) { | |
615 $self->_print( "//\n"); | |
616 return; | |
617 } | |
618 $self->_print( "XX\n"); | |
619 | |
620 # finished printing features. | |
621 | |
622 $str =~ tr/A-Z/a-z/; | |
623 | |
624 # Count each nucleotide | |
625 my $alen = $str =~ tr/a/a/; | |
626 my $clen = $str =~ tr/c/c/; | |
627 my $glen = $str =~ tr/g/g/; | |
628 my $tlen = $str =~ tr/t/t/; | |
629 | |
630 my $olen = $len - ($alen + $tlen + $clen + $glen); | |
631 if( $olen < 0 ) { | |
632 $self->warn("Weird. More atgc than bases. Problem!"); | |
633 } | |
634 | |
635 $self->_print("SQ Sequence $len BP; $alen A; $clen C; $glen G; $tlen T; $olen other;\n"); | |
636 | |
637 my $nuc = 60; # Number of nucleotides per line | |
638 my $whole_pat = 'a10' x 6; # Pattern for unpacking a whole line | |
639 my $out_pat = 'A11' x 6; # Pattern for packing a line | |
640 my $length = length($str); | |
641 | |
642 # Calculate the number of nucleotides which fit on whole lines | |
643 my $whole = int($length / $nuc) * $nuc; | |
644 | |
645 # Print the whole lines | |
646 my( $i ); | |
647 for ($i = 0; $i < $whole; $i += $nuc) { | |
648 my $blocks = pack $out_pat, | |
649 unpack $whole_pat, | |
650 substr($str, $i, $nuc); | |
651 $self->_print(sprintf(" $blocks%9d\n", $i + $nuc)); | |
652 } | |
653 | |
654 # Print the last line | |
655 if (my $last = substr($str, $i)) { | |
656 my $last_len = length($last); | |
657 my $last_pat = 'a10' x int($last_len / 10) .'a'. $last_len % 10; | |
658 my $blocks = pack $out_pat, | |
659 unpack($last_pat, $last); | |
660 $self->_print(sprintf(" $blocks%9d\n", $length)); # Add the length to the end | |
661 } | |
662 | |
663 $self->_print( "//\n"); | |
664 | |
665 $self->flush if $self->_flush_on_write && defined $self->_fh; | |
666 return 1; | |
667 } | |
668 } | |
669 | |
670 =head2 _print_EMBL_FTHelper | |
671 | |
672 Title : _print_EMBL_FTHelper | |
673 Usage : | |
674 Function: Internal function | |
675 Returns : | |
676 Args : | |
677 | |
678 | |
679 =cut | |
680 | |
681 sub _print_EMBL_FTHelper { | |
682 my ($self,$fth,$always_quote) = @_; | |
683 $always_quote ||= 0; | |
684 | |
685 if( ! ref $fth || ! $fth->isa('Bio::SeqIO::FTHelper') ) { | |
686 $fth->warn("$fth is not a FTHelper class. Attempting to print, but there could be tears!"); | |
687 } | |
688 | |
689 | |
690 #$self->_print( "FH Key Location/Qualifiers\n"); | |
691 #$self->_print( sprintf("FT %-15s %s\n",$fth->key,$fth->loc)); | |
692 # let | |
693 if( $fth->key eq 'CONTIG' ) { | |
694 $self->_print("XX\n"); | |
695 $self->_write_line_EMBL_regex("CO ", | |
696 "CO ",$fth->loc, | |
697 '\,|$',80); #' | |
698 return; | |
699 } | |
700 $self->_write_line_EMBL_regex(sprintf("FT %-15s ",$fth->key), | |
701 "FT ",$fth->loc, | |
702 '\,|$',80); #' | |
703 foreach my $tag ( keys %{$fth->field} ) { | |
704 if( ! defined $fth->field->{$tag} ) { next; } | |
705 foreach my $value ( @{$fth->field->{$tag}} ) { | |
706 $value =~ s/\"/\"\"/g; | |
707 if ($value eq "_no_value") { | |
708 $self->_write_line_EMBL_regex("FT ", | |
709 "FT ", | |
710 "/$tag",'.|$',80); #' | |
711 } | |
712 elsif( $always_quote == 1 || $value !~ /^\d+$/ ) { | |
713 my $pat = $value =~ /\s/ ? '\s|$' : '.|$'; | |
714 $self->_write_line_EMBL_regex("FT ", | |
715 "FT ", | |
716 "/$tag=\"$value\"",$pat,80); | |
717 } | |
718 else { | |
719 $self->_write_line_EMBL_regex("FT ", | |
720 "FT ", | |
721 "/$tag=$value",'.|$',80); #' | |
722 } | |
723 # $self->_print( "FT /", $tag, "=\"", $value, "\"\n"); | |
724 } | |
725 } | |
726 } | |
727 | |
728 #' | |
729 =head2 _read_EMBL_References | |
730 | |
731 Title : _read_EMBL_References | |
732 Usage : | |
733 Function: Reads references from EMBL format. Internal function really | |
734 Example : | |
735 Returns : | |
736 Args : | |
737 | |
738 | |
739 =cut | |
740 | |
741 sub _read_EMBL_References { | |
742 my ($self,$buffer) = @_; | |
743 my (@refs); | |
744 | |
745 # assumme things are starting with RN | |
746 | |
747 if( $$buffer !~ /^RN/ ) { | |
748 warn("Not parsing line '$$buffer' which maybe important"); | |
749 } | |
750 my $b1; | |
751 my $b2; | |
752 my $title; | |
753 my $loc; | |
754 my $au; | |
755 my $med; | |
756 my $pm; | |
757 my $com; | |
758 | |
759 while( defined ($_ = $self->_readline) ) { | |
760 /^R/ || last; | |
761 /^RP (\d+)-(\d+)/ && do {$b1=$1;$b2=$2;}; | |
762 /^RX MEDLINE;\s+(\d+)/ && do {$med=$1}; | |
763 /^RX PUBMED;\s+(\d+)/ && do {$pm=$1}; | |
764 /^RA (.*)/ && do { | |
765 $au = $self->_concatenate_lines($au,$1); next; | |
766 }; | |
767 /^RT (.*)/ && do { | |
768 $title = $self->_concatenate_lines($title,$1); next; | |
769 }; | |
770 /^RL (.*)/ && do { | |
771 $loc = $self->_concatenate_lines($loc,$1); next; | |
772 }; | |
773 /^RC (.*)/ && do { | |
774 $com = $self->_concatenate_lines($com,$1); next; | |
775 }; | |
776 } | |
777 | |
778 my $ref = new Bio::Annotation::Reference; | |
779 $au =~ s/;\s*$//g; | |
780 $title =~ s/;\s*$//g; | |
781 | |
782 $ref->start($b1); | |
783 $ref->end($b2); | |
784 $ref->authors($au); | |
785 $ref->title($title); | |
786 $ref->location($loc); | |
787 $ref->medline($med); | |
788 $ref->comment($com); | |
789 $ref->pubmed($pm); | |
790 | |
791 push(@refs,$ref); | |
792 $$buffer = $_; | |
793 | |
794 return @refs; | |
795 } | |
796 | |
797 =head2 _read_EMBL_Species | |
798 | |
799 Title : _read_EMBL_Species | |
800 Usage : | |
801 Function: Reads the EMBL Organism species and classification | |
802 lines. | |
803 Example : | |
804 Returns : A Bio::Species object | |
805 Args : a reference to the current line buffer | |
806 | |
807 =cut | |
808 | |
809 sub _read_EMBL_Species { | |
810 my( $self, $buffer ) = @_; | |
811 my $org; | |
812 | |
813 $_ = $$buffer; | |
814 my( $sub_species, $species, $genus, $common, @class ); | |
815 while (defined( $_ ||= $self->_readline )) { | |
816 | |
817 if (/^OS\s+(\S+)(?:\s+([^\(]\S*))?(?:\s+([^\(]\S*))?(?:\s+\((.*)\))?/) { | |
818 $genus = $1; | |
819 $species = $2 || 'sp.'; | |
820 $sub_species = $3 if $3; | |
821 $common = $4 if $4; | |
822 } | |
823 elsif (s/^OC\s+//) { | |
824 # only split on ';' or '.' so that | |
825 # classification that is 2 words will | |
826 # still get matched | |
827 # use map to remove trailing/leading spaces | |
828 chomp; | |
829 push(@class, map { s/^\s+//; s/\s+$//; $_; } split /[;\.]+/); | |
830 } | |
831 elsif (/^OG\s+(.*)/) { | |
832 $org = $1; | |
833 } | |
834 else { | |
835 last; | |
836 } | |
837 | |
838 $_ = undef; # Empty $_ to trigger read of next line | |
839 } | |
840 | |
841 $$buffer = $_; | |
842 | |
843 # Don't make a species object if it is "Unknown" or "None" | |
844 return if $genus =~ /^(Unknown|None)$/i; | |
845 | |
846 # Bio::Species array needs array in Species -> Kingdom direction | |
847 if ($class[$#class] eq $genus) { | |
848 push( @class, $species ); | |
849 } else { | |
850 push( @class, $genus, $species ); | |
851 } | |
852 @class = reverse @class; | |
853 | |
854 my $make = Bio::Species->new(); | |
855 $make->classification( \@class, "FORCE" ); # no name validation please | |
856 $make->common_name( $common ) if $common; | |
857 $make->sub_species( $sub_species ) if $sub_species; | |
858 $make->organelle ( $org ) if $org; | |
859 return $make; | |
860 } | |
861 | |
862 =head2 _read_EMBL_DBLink | |
863 | |
864 Title : _read_EMBL_DBLink | |
865 Usage : | |
866 Function: Reads the EMBL database cross reference ("DR") lines | |
867 Example : | |
868 Returns : A list of Bio::Annotation::DBLink objects | |
869 Args : | |
870 | |
871 =cut | |
872 | |
873 sub _read_EMBL_DBLink { | |
874 my( $self,$buffer ) = @_; | |
875 my( @db_link ); | |
876 | |
877 $_ = $$buffer; | |
878 while (defined( $_ ||= $self->_readline )) { | |
879 | |
880 if (my($databse, $prim_id, $sec_id) | |
881 = /^DR ([^\s;]+);\s*([^\s;]+);\s*([^\s;]+)?\.$/) { | |
882 my $link = Bio::Annotation::DBLink->new(); | |
883 $link->database ( $databse ); | |
884 $link->primary_id ( $prim_id ); | |
885 $link->optional_id( $sec_id ) if $sec_id; | |
886 push(@db_link, $link); | |
887 } | |
888 else { | |
889 last; | |
890 } | |
891 | |
892 $_ = undef; # Empty $_ to trigger read of next line | |
893 } | |
894 | |
895 $$buffer = $_; | |
896 | |
897 return @db_link; | |
898 } | |
899 | |
900 =head2 _filehandle | |
901 | |
902 Title : _filehandle | |
903 Usage : $obj->_filehandle($newval) | |
904 Function: | |
905 Example : | |
906 Returns : value of _filehandle | |
907 Args : newvalue (optional) | |
908 | |
909 | |
910 =cut | |
911 | |
912 sub _filehandle{ | |
913 my ($obj,$value) = @_; | |
914 if( defined $value) { | |
915 $obj->{'_filehandle'} = $value; | |
916 } | |
917 return $obj->{'_filehandle'}; | |
918 | |
919 } | |
920 | |
921 =head2 _read_FTHelper_EMBL | |
922 | |
923 Title : _read_FTHelper_EMBL | |
924 Usage : _read_FTHelper_EMBL($buffer) | |
925 Function: reads the next FT key line | |
926 Example : | |
927 Returns : Bio::SeqIO::FTHelper object | |
928 Args : filehandle and reference to a scalar | |
929 | |
930 | |
931 =cut | |
932 | |
933 sub _read_FTHelper_EMBL { | |
934 my ($self,$buffer) = @_; | |
935 | |
936 my ($key, # The key of the feature | |
937 $loc, # The location line from the feature | |
938 @qual, # An arrray of lines making up the qualifiers | |
939 ); | |
940 | |
941 if ($$buffer =~ /^FT\s{3}(\S+)\s+(\S+)/) { | |
942 $key = $1; | |
943 $loc = $2; | |
944 # Read all the lines up to the next feature | |
945 while ( defined($_ = $self->_readline) ) { | |
946 if (/^FT(\s+)(.+?)\s*$/) { | |
947 # Lines inside features are preceeded by 19 spaces | |
948 # A new feature is preceeded by 3 spaces | |
949 if (length($1) > 4) { | |
950 # Add to qualifiers if we're in the qualifiers | |
951 if (@qual) { | |
952 push(@qual, $2); | |
953 } | |
954 # Start the qualifier list if it's the first qualifier | |
955 elsif (substr($2, 0, 1) eq '/') { | |
956 @qual = ($2); | |
957 } | |
958 # We're still in the location line, so append to location | |
959 else { | |
960 $loc .= $2; | |
961 } | |
962 } else { | |
963 # We've reached the start of the next feature | |
964 last; | |
965 } | |
966 } else { | |
967 # We're at the end of the feature table | |
968 last; | |
969 } | |
970 } | |
971 } elsif( $$buffer =~ /^CO\s+(\S+)/) { | |
972 $key = 'CONTIG'; | |
973 $loc = $1; | |
974 # Read all the lines up to the next feature | |
975 while ( defined($_ = $self->_readline) ) { | |
976 if (/^CO\s+(\S+)\s*$/) { | |
977 $loc .= $1; | |
978 } else { | |
979 # We've reached the start of the next feature | |
980 last; | |
981 } | |
982 } | |
983 } else { | |
984 # No feature key | |
985 return; | |
986 } | |
987 | |
988 # Put the first line of the next feature into the buffer | |
989 $$buffer = $_; | |
990 | |
991 # Make the new FTHelper object | |
992 my $out = new Bio::SeqIO::FTHelper(); | |
993 $out->verbose($self->verbose()); | |
994 $out->key($key); | |
995 $out->loc($loc); | |
996 | |
997 # Now parse and add any qualifiers. (@qual is kept | |
998 # intact to provide informative error messages.) | |
999 QUAL: for (my $i = 0; $i < @qual; $i++) { | |
1000 $_ = $qual[$i]; | |
1001 my( $qualifier, $value ) = m{^/([^=]+)(?:=(.+))?} | |
1002 or $self->throw("Can't see new qualifier in: $_\nfrom:\n" | |
1003 . join('', map "$_\n", @qual)); | |
1004 if (defined $value) { | |
1005 # Do we have a quoted value? | |
1006 if (substr($value, 0, 1) eq '"') { | |
1007 # Keep adding to value until we find the trailing quote | |
1008 # and the quotes are balanced | |
1009 while ($value !~ /"$/ or $value =~ tr/"/"/ % 2) { #" | |
1010 $i++; | |
1011 my $next = $qual[$i]; | |
1012 unless (defined($next)) { | |
1013 warn("Unbalanced quote in:\n", map("$_\n", @qual), | |
1014 "No further qualifiers will be added for this feature"); | |
1015 last QUAL; | |
1016 } | |
1017 | |
1018 # Join to value with space if value or next line contains a space | |
1019 $value .= (grep /\s/, ($value, $next)) ? " $next" : $next; | |
1020 } | |
1021 # Trim leading and trailing quotes | |
1022 $value =~ s/^"|"$//g; | |
1023 # Undouble internal quotes | |
1024 $value =~ s/""/"/g; #" | |
1025 } | |
1026 } else { | |
1027 $value = '_no_value'; | |
1028 } | |
1029 | |
1030 # Store the qualifier | |
1031 $out->field->{$qualifier} ||= []; | |
1032 push(@{$out->field->{$qualifier}},$value); | |
1033 } | |
1034 | |
1035 return $out; | |
1036 } | |
1037 | |
1038 =head2 _write_line_EMBL | |
1039 | |
1040 Title : _write_line_EMBL | |
1041 Usage : | |
1042 Function: internal function | |
1043 Example : | |
1044 Returns : | |
1045 Args : | |
1046 | |
1047 | |
1048 =cut | |
1049 | |
1050 sub _write_line_EMBL { | |
1051 my ($self,$pre1,$pre2,$line,$length) = @_; | |
1052 | |
1053 $length || die "Miscalled write_line_EMBL without length. Programming error!"; | |
1054 my $subl = $length - length $pre2; | |
1055 my $linel = length $line; | |
1056 my $i; | |
1057 | |
1058 my $sub = substr($line,0,$length - length $pre1); | |
1059 | |
1060 $self->_print( "$pre1$sub\n"); | |
1061 | |
1062 for($i= ($length - length $pre1);$i < $linel;) { | |
1063 $sub = substr($line,$i,($subl)); | |
1064 $self->_print( "$pre2$sub\n"); | |
1065 $i += $subl; | |
1066 } | |
1067 | |
1068 } | |
1069 | |
1070 =head2 _write_line_EMBL_regex | |
1071 | |
1072 Title : _write_line_EMBL_regex | |
1073 Usage : | |
1074 Function: internal function for writing lines of specified | |
1075 length, with different first and the next line | |
1076 left hand headers and split at specific points in the | |
1077 text | |
1078 Example : | |
1079 Returns : nothing | |
1080 Args : file handle, first header, second header, text-line, regex for line breaks, total line length | |
1081 | |
1082 | |
1083 =cut | |
1084 | |
1085 sub _write_line_EMBL_regex { | |
1086 my ($self,$pre1,$pre2,$line,$regex,$length) = @_; | |
1087 | |
1088 #print STDOUT "Going to print with $line!\n"; | |
1089 | |
1090 $length || die "Programming error - called write_line_EMBL_regex without length."; | |
1091 | |
1092 my $subl = $length - (length $pre1) -1 ; | |
1093 | |
1094 | |
1095 | |
1096 my( @lines ); | |
1097 while(defined $line && | |
1098 $line =~ m/(.{1,$subl})($regex)/g) { | |
1099 push(@lines, $1.$2); | |
1100 } | |
1101 foreach (@lines) { s/\s+$//; } | |
1102 | |
1103 # Print first line | |
1104 my $s = shift(@lines) || ''; | |
1105 $self->_print( "$pre1$s\n"); | |
1106 | |
1107 # Print the rest | |
1108 foreach my $s ( @lines ) { | |
1109 $s = '' if( !defined $s ); | |
1110 $self->_print( "$pre2$s\n"); | |
1111 } | |
1112 } | |
1113 | |
1114 =head2 _post_sort | |
1115 | |
1116 Title : _post_sort | |
1117 Usage : $obj->_post_sort($newval) | |
1118 Function: | |
1119 Returns : value of _post_sort | |
1120 Args : newvalue (optional) | |
1121 | |
1122 | |
1123 =cut | |
1124 | |
1125 sub _post_sort{ | |
1126 my $obj = shift; | |
1127 if( @_ ) { | |
1128 my $value = shift; | |
1129 $obj->{'_post_sort'} = $value; | |
1130 } | |
1131 return $obj->{'_post_sort'}; | |
1132 | |
1133 } | |
1134 | |
1135 =head2 _show_dna | |
1136 | |
1137 Title : _show_dna | |
1138 Usage : $obj->_show_dna($newval) | |
1139 Function: | |
1140 Returns : value of _show_dna | |
1141 Args : newvalue (optional) | |
1142 | |
1143 | |
1144 =cut | |
1145 | |
1146 sub _show_dna{ | |
1147 my $obj = shift; | |
1148 if( @_ ) { | |
1149 my $value = shift; | |
1150 $obj->{'_show_dna'} = $value; | |
1151 } | |
1152 return $obj->{'_show_dna'}; | |
1153 | |
1154 } | |
1155 | |
1156 =head2 _id_generation_func | |
1157 | |
1158 Title : _id_generation_func | |
1159 Usage : $obj->_id_generation_func($newval) | |
1160 Function: | |
1161 Returns : value of _id_generation_func | |
1162 Args : newvalue (optional) | |
1163 | |
1164 | |
1165 =cut | |
1166 | |
1167 sub _id_generation_func{ | |
1168 my $obj = shift; | |
1169 if( @_ ) { | |
1170 my $value = shift; | |
1171 $obj->{'_id_generation_func'} = $value; | |
1172 } | |
1173 return $obj->{'_id_generation_func'}; | |
1174 | |
1175 } | |
1176 | |
1177 =head2 _ac_generation_func | |
1178 | |
1179 Title : _ac_generation_func | |
1180 Usage : $obj->_ac_generation_func($newval) | |
1181 Function: | |
1182 Returns : value of _ac_generation_func | |
1183 Args : newvalue (optional) | |
1184 | |
1185 | |
1186 =cut | |
1187 | |
1188 sub _ac_generation_func{ | |
1189 my $obj = shift; | |
1190 if( @_ ) { | |
1191 my $value = shift; | |
1192 $obj->{'_ac_generation_func'} = $value; | |
1193 } | |
1194 return $obj->{'_ac_generation_func'}; | |
1195 | |
1196 } | |
1197 | |
1198 =head2 _sv_generation_func | |
1199 | |
1200 Title : _sv_generation_func | |
1201 Usage : $obj->_sv_generation_func($newval) | |
1202 Function: | |
1203 Returns : value of _sv_generation_func | |
1204 Args : newvalue (optional) | |
1205 | |
1206 | |
1207 =cut | |
1208 | |
1209 sub _sv_generation_func{ | |
1210 my $obj = shift; | |
1211 if( @_ ) { | |
1212 my $value = shift; | |
1213 $obj->{'_sv_generation_func'} = $value; | |
1214 } | |
1215 return $obj->{'_sv_generation_func'}; | |
1216 | |
1217 } | |
1218 | |
1219 =head2 _kw_generation_func | |
1220 | |
1221 Title : _kw_generation_func | |
1222 Usage : $obj->_kw_generation_func($newval) | |
1223 Function: | |
1224 Returns : value of _kw_generation_func | |
1225 Args : newvalue (optional) | |
1226 | |
1227 | |
1228 =cut | |
1229 | |
1230 sub _kw_generation_func{ | |
1231 my $obj = shift; | |
1232 if( @_ ) { | |
1233 my $value = shift; | |
1234 $obj->{'_kw_generation_func'} = $value; | |
1235 } | |
1236 return $obj->{'_kw_generation_func'}; | |
1237 | |
1238 } | |
1239 | |
1240 1; |