comparison variant_effect_predictor/Bio/SeqIO.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1
2 # $Id: SeqIO.pm,v 1.59.2.4 2003/09/14 19:16:53 jason Exp $
3 #
4 # BioPerl module for Bio::SeqIO
5 #
6 # Cared for by Ewan Birney <birney@sanger.ac.uk>
7 # and Lincoln Stein <lstein@cshl.org>
8 #
9 # Copyright Ewan Birney
10 #
11 # You may distribute this module under the same terms as perl itself
12 #
13 # _history
14 # October 18, 1999 Largely rewritten by Lincoln Stein
15
16 # POD documentation - main docs before the code
17
18 =head1 NAME
19
20 Bio::SeqIO - Handler for SeqIO Formats
21
22 =head1 SYNOPSIS
23
24 use Bio::SeqIO;
25
26 $in = Bio::SeqIO->new(-file => "inputfilename" , '-format' => 'Fasta');
27 $out = Bio::SeqIO->new(-file => ">outputfilename" , '-format' => 'EMBL');
28 # note: we quote -format to keep older Perls from complaining.
29
30 while ( my $seq = $in->next_seq() ) {
31 $out->write_seq($seq);
32 }
33
34 Now, to actually get at the sequence object, use the standard Bio::Seq
35 methods (look at L<Bio::Seq> if you don't know what they are)
36
37 use Bio::SeqIO;
38
39 $in = Bio::SeqIO->new(-file => "inputfilename" , '-format' => 'genbank');
40
41 while ( my $seq = $in->next_seq() ) {
42 print "Sequence ",$seq->id," first 10 bases ",$seq->subseq(1,10),"\n";
43 }
44
45
46 The SeqIO system does have a filehandle binding. Most people find this
47 a little confusing, but it does mean you write the world's smallest
48 reformatter
49
50 use Bio::SeqIO;
51
52 $in = Bio::SeqIO->newFh(-file => "inputfilename" , '-format' => 'Fasta');
53 $out = Bio::SeqIO->newFh('-format' => 'EMBL');
54
55 # World's shortest Fasta<->EMBL format converter:
56 print $out $_ while <$in>;
57
58
59 =head1 DESCRIPTION
60
61 Bio::SeqIO is a handler module for the formats in the SeqIO set (eg,
62 Bio::SeqIO::fasta). It is the officially sanctioned way of getting at
63 the format objects, which most people should use.
64
65 The Bio::SeqIO system can be thought of like biological file handles.
66 They are attached to filehandles with smart formatting rules (eg,
67 genbank format, or EMBL format, or binary trace file format) and
68 can either read or write sequence objects (Bio::Seq objects, or
69 more correctly, Bio::SeqI implementing objects, of which Bio::Seq is
70 one such object). If you want to know what to do with a Bio::Seq
71 object, read L<Bio::Seq>.
72
73 The idea is that you request a stream object for a particular format.
74 All the stream objects have a notion of an internal file that is read
75 from or written to. A particular SeqIO object instance is configured
76 for either input or output. A specific example of a stream object is
77 the Bio::SeqIO::fasta object.
78
79 Each stream object has functions
80
81 $stream->next_seq();
82
83 and
84
85 $stream->write_seq($seq);
86
87 As an added bonus, you can recover a filehandle that is tied to the
88 SeqIO object, allowing you to use the standard E<lt>E<gt> and print operations
89 to read and write sequence objects:
90
91 use Bio::SeqIO;
92
93 $stream = Bio::SeqIO->newFh(-format => 'Fasta'); # read from standard input
94
95 while ( $seq = <$stream> ) {
96 # do something with $seq
97 }
98
99 and
100
101 print $stream $seq; # when stream is in output mode
102
103 This makes the simplest ever reformatter
104
105 #!/usr/local/bin/perl
106
107 $format1 = shift;
108 $format2 = shift || die "Usage: reformat format1 format2 < input > output";
109
110 use Bio::SeqIO;
111
112 $in = Bio::SeqIO->newFh(-format => $format1 );
113 $out = Bio::SeqIO->newFh(-format => $format2 );
114 #note: you might want to quote -format to keep older perl's from complaining.
115
116 print $out $_ while <$in>;
117
118
119 =head1 CONSTRUCTORS
120
121 =head2 Bio::SeqIO-E<gt>new()
122
123 $seqIO = Bio::SeqIO->new(-file => 'filename', -format=>$format);
124 $seqIO = Bio::SeqIO->new(-fh => \*FILEHANDLE, -format=>$format);
125 $seqIO = Bio::SeqIO->new(-format => $format);
126
127 The new() class method constructs a new Bio::SeqIO object. The
128 returned object can be used to retrieve or print Seq objects. new()
129 accepts the following parameters:
130
131 =over 4
132
133 =item -file
134
135 A file path to be opened for reading or writing. The usual Perl
136 conventions apply:
137
138 'file' # open file for reading
139 '>file' # open file for writing
140 '>>file' # open file for appending
141 '+<file' # open file read/write
142 'command |' # open a pipe from the command
143 '| command' # open a pipe to the command
144
145 =item -fh
146
147 You may provide new() with a previously-opened filehandle. For
148 example, to read from STDIN:
149
150 $seqIO = Bio::SeqIO->new(-fh => \*STDIN);
151
152 Note that you must pass filehandles as references to globs.
153
154 If neither a filehandle nor a filename is specified, then the module
155 will read from the @ARGV array or STDIN, using the familiar E<lt>E<gt>
156 semantics.
157
158 A string filehandle is handy if you want to modify the output in the
159 memory, before printing it out. The following program reads in EMBL
160 formatted entries from a file and prints them out in fasta format with
161 some HTML tags:
162
163 use Bio::SeqIO;
164 use IO::String;
165 my $in = Bio::SeqIO->new('-file' => "emblfile" ,
166 '-format' => 'EMBL');
167 while ( my $seq = $in->next_seq() ) {
168 # the output handle is reset for every file
169 my $stringio = IO::String->new($string);
170 my $out = Bio::SeqIO->new('-fh' => $stringio,
171 '-format' => 'fasta');
172 # output goes into $string
173 $out->write_seq($seq);
174 # modify $string
175 $string =~ s|(>)(\w+)|$1<font color="Red">$2</font>|g;
176 # print into STDOUT
177 print $string;
178 }
179
180 =item -format
181
182 Specify the format of the file. Supported formats include:
183
184 Fasta FASTA format
185 EMBL EMBL format
186 GenBank GenBank format
187 swiss Swissprot format
188 PIR Protein Information Resource format
189 GCG GCG format
190 raw Raw format (one sequence per line, no ID)
191 ace ACeDB sequence format
192 game GAME XML format
193 phd phred output
194 qual Quality values (get a sequence of quality scores)
195 Fastq Fastq format
196 SCF SCF tracefile format
197 ABI ABI tracefile format
198 ALF ALF tracefile format
199 CTF CTF tracefile format
200 ZTR ZTR tracefile format
201 PLN Staden plain tracefile format
202 EXP Staden tagged experiment tracefile format
203
204 If no format is specified and a filename is given then the module
205 will attempt to deduce the format from the filename suffix. If this
206 is unsuccessful then Fasta format is assumed.
207
208 The format name is case insensitive. 'FASTA', 'Fasta' and 'fasta' are
209 all valid suffixes.
210
211 Currently, the tracefile formats (except for SCF) require installation
212 of the external Staden "io_lib" package, as well as the
213 Bio::SeqIO::staden::read package available from the bioperl-ext
214 repository.
215
216 =item -flush
217
218 By default, all files (or filehandles) opened for writing sequences
219 will be flushed after each write_seq() (making the file immediately
220 usable). If you don't need this facility and would like to marginally
221 improve the efficiency of writing multiple sequences to the same file
222 (or filehandle), pass the -flush option '0' or any other value that
223 evaluates as defined but false:
224
225 my $gb = new Bio::SeqIO -file => "<gball.gbk",
226 -format => "gb";
227 my $fa = new Bio::SeqIO -file => ">gball.fa",
228 -format => "fasta",
229 -flush => 0; # go as fast as we can!
230 while($seq = $gb->next_seq) { $fa->write_seq($seq) }
231
232
233 =back
234
235 =head2 Bio::SeqIO-E<gt>newFh()
236
237 $fh = Bio::SeqIO->newFh(-fh => \*FILEHANDLE, -format=>$format);
238 $fh = Bio::SeqIO->newFh(-format => $format);
239 # etc.
240
241 This constructor behaves like new(), but returns a tied filehandle
242 rather than a Bio::SeqIO object. You can read sequences from this
243 object using the familiar E<lt>E<gt> operator, and write to it using
244 print(). The usual array and $_ semantics work. For example, you can
245 read all sequence objects into an array like this:
246
247 @sequences = <$fh>;
248
249 Other operations, such as read(), sysread(), write(), close(), and printf()
250 are not supported.
251
252 =head1 OBJECT METHODS
253
254 See below for more detailed summaries. The main methods are:
255
256 =head2 $sequence = $seqIO-E<gt>next_seq()
257
258 Fetch the next sequence from the stream.
259
260 =head2 $seqIO-E<gt>write_seq($sequence [,$another_sequence,...])
261
262 Write the specified sequence(s) to the stream.
263
264 =head2 TIEHANDLE(), READLINE(), PRINT()
265
266 These provide the tie interface. See L<perltie> for more details.
267
268 =head1 FEEDBACK
269
270 =head2 Mailing Lists
271
272 User feedback is an integral part of the evolution of this
273 and other Bioperl modules. Send your comments and suggestions preferably
274 to one of the Bioperl mailing lists.
275
276 Your participation is much appreciated.
277
278 bioperl-l@bioperl.org - General discussion
279 http://bioperl.org/MailList.shtml - About the mailing lists
280
281 =head2 Reporting Bugs
282
283 Report bugs to the Bioperl bug tracking system to help us keep track
284 the bugs and their resolution.
285 Bug reports can be submitted via email or the web:
286
287 bioperl-bugs@bioperl.org
288 http://bugzilla.bioperl.org/
289
290 =head1 AUTHOR - Ewan Birney, Lincoln Stein
291
292 Email birney@ebi.ac.uk
293
294 =head1 APPENDIX
295
296 The rest of the documentation details each of the object
297 methods. Internal methods are usually preceded with a _
298
299 =cut
300
301 #' Let the code begin...
302
303 package Bio::SeqIO;
304
305 use strict;
306 use vars qw(@ISA);
307
308 use Bio::Root::Root;
309 use Bio::Root::IO;
310 use Bio::Factory::SequenceStreamI;
311 use Bio::Factory::FTLocationFactory;
312 use Bio::Seq::SeqBuilder;
313 use Symbol();
314
315 @ISA = qw(Bio::Root::Root Bio::Root::IO Bio::Factory::SequenceStreamI);
316
317 sub BEGIN {
318 eval { require Bio::SeqIO::staden::read; };
319 }
320
321 my %valid_alphabet_cache;
322
323 =head2 new
324
325 Title : new
326 Usage : $stream = Bio::SeqIO->new(-file => $filename, -format => 'Format')
327 Function: Returns a new seqstream
328 Returns : A Bio::SeqIO stream initialised with the appropriate format
329 Args : Named parameters:
330 -file => $filename
331 -fh => filehandle to attach to
332 -format => format
333
334 Additional arguments may be used to set factories and
335 builders involved in the sequence object creation. None of
336 these must be provided, they all have reasonable defaults.
337 -seqfactory the L<Bio::Factory::SequenceFactoryI> object
338 -locfactory the L<Bio::Factory::LocationFactoryI> object
339 -objbuilder the L<Bio::Factory::ObjectBuilderI> object
340
341 See L<Bio::SeqIO::Handler>
342
343 =cut
344
345 my $entry = 0;
346
347 sub new {
348 my ($caller,@args) = @_;
349 my $class = ref($caller) || $caller;
350
351 # or do we want to call SUPER on an object if $caller is an
352 # object?
353 if( $class =~ /Bio::SeqIO::(\S+)/ ) {
354 my ($self) = $class->SUPER::new(@args);
355 $self->_initialize(@args);
356 return $self;
357 } else {
358
359 my %param = @args;
360 @param{ map { lc $_ } keys %param } = values %param; # lowercase keys
361 my $format = $param{'-format'} ||
362 $class->_guess_format( $param{-file} || $ARGV[0] ) ||
363 'fasta';
364 $format = "\L$format"; # normalize capitalization to lower case
365
366 # normalize capitalization
367 return undef unless( $class->_load_format_module($format) );
368 return "Bio::SeqIO::$format"->new(@args);
369 }
370 }
371
372 =head2 newFh
373
374 Title : newFh
375 Usage : $fh = Bio::SeqIO->newFh(-file=>$filename,-format=>'Format')
376 Function: does a new() followed by an fh()
377 Example : $fh = Bio::SeqIO->newFh(-file=>$filename,-format=>'Format')
378 $sequence = <$fh>; # read a sequence object
379 print $fh $sequence; # write a sequence object
380 Returns : filehandle tied to the Bio::SeqIO::Fh class
381 Args :
382
383 See L<Bio::SeqIO::Fh>
384
385 =cut
386
387 sub newFh {
388 my $class = shift;
389 return unless my $self = $class->new(@_);
390 return $self->fh;
391 }
392
393 =head2 fh
394
395 Title : fh
396 Usage : $obj->fh
397 Function:
398 Example : $fh = $obj->fh; # make a tied filehandle
399 $sequence = <$fh>; # read a sequence object
400 print $fh $sequence; # write a sequence object
401 Returns : filehandle tied to Bio::SeqIO class
402 Args : none
403
404 =cut
405
406
407 sub fh {
408 my $self = shift;
409 my $class = ref($self) || $self;
410 my $s = Symbol::gensym;
411 tie $$s,$class,$self;
412 return $s;
413 }
414
415 # _initialize is chained for all SeqIO classes
416
417 sub _initialize {
418 my($self, @args) = @_;
419
420 # flush is initialized by the Root::IO init
421
422 my ($seqfact,$locfact,$objbuilder) =
423 $self->_rearrange([qw(SEQFACTORY
424 LOCFACTORY
425 OBJBUILDER)
426 ], @args);
427
428 $locfact = Bio::Factory::FTLocationFactory->new(-verbose => $self->verbose) if ! $locfact;
429 $objbuilder = Bio::Seq::SeqBuilder->new(-verbose => $self->verbose) unless $objbuilder;
430 $self->sequence_builder($objbuilder);
431 $self->location_factory($locfact);
432 # note that this should come last because it propagates the sequence
433 # factory to the sequence builder
434 $seqfact && $self->sequence_factory($seqfact);
435
436 # initialize the IO part
437 $self->_initialize_io(@args);
438 }
439
440 =head2 next_seq
441
442 Title : next_seq
443 Usage : $seq = stream->next_seq
444 Function: Reads the next sequence object from the stream and returns it.
445
446 Certain driver modules may encounter entries in the stream that
447 are either misformatted or that use syntax not yet understood
448 by the driver. If such an incident is recoverable, e.g., by
449 dismissing a feature of a feature table or some other non-mandatory
450 part of an entry, the driver will issue a warning. In the case
451 of a non-recoverable situation an exception will be thrown.
452 Do not assume that you can resume parsing the same stream after
453 catching the exception. Note that you can always turn recoverable
454 errors into exceptions by calling $stream->verbose(2).
455 Returns : a Bio::Seq sequence object
456 Args : none
457
458 See L<Bio::Root::RootI>, L<Bio::Factory::SeqStreamI>, L<Bio::Seq>
459
460 =cut
461
462 sub next_seq {
463 my ($self, $seq) = @_;
464 $self->throw("Sorry, you cannot read from a generic Bio::SeqIO object.");
465 }
466
467 =head2 write_seq
468
469 Title : write_seq
470 Usage : $stream->write_seq($seq)
471 Function: writes the $seq object into the stream
472 Returns : 1 for success and 0 for error
473 Args : Bio::Seq object
474
475 =cut
476
477 sub write_seq {
478 my ($self, $seq) = @_;
479 $self->throw("Sorry, you cannot write to a generic Bio::SeqIO object.");
480 }
481
482
483 =head2 alphabet
484
485 Title : alphabet
486 Usage : $self->alphabet($newval)
487 Function: Set/get the molecule type for the Seq objects to be created.
488 Example : $seqio->alphabet('protein')
489 Returns : value of alphabet: 'dna', 'rna', or 'protein'
490 Args : newvalue (optional)
491 Throws : Exception if the argument is not one of 'dna', 'rna', or 'protein'
492
493 =cut
494
495 sub alphabet {
496 my ($self, $value) = @_;
497
498 if ( defined $value) {
499 $value = lc $value;
500 unless ($valid_alphabet_cache{$value}) {
501 # instead of hard-coding the allowed values once more, we check by
502 # creating a dummy sequence object
503 eval {
504 require Bio::PrimarySeq;
505 my $seq = Bio::PrimarySeq->new('-verbose' => $self->verbose,
506 '-alphabet' => $value);
507
508 };
509 if ($@) {
510 $self->throw("Invalid alphabet: $value\n. See Bio::PrimarySeq for allowed values.");
511 }
512 $valid_alphabet_cache{$value} = 1;
513 }
514 $self->{'alphabet'} = $value;
515 }
516 return $self->{'alphabet'};
517 }
518
519 =head2 _load_format_module
520
521 Title : _load_format_module
522 Usage : *INTERNAL SeqIO stuff*
523 Function: Loads up (like use) a module at run time on demand
524 Example :
525 Returns :
526 Args :
527
528 =cut
529
530 sub _load_format_module {
531 my ($self, $format) = @_;
532 my $module = "Bio::SeqIO::" . $format;
533 my $ok;
534
535 eval {
536 $ok = $self->_load_module($module);
537 };
538 if ( $@ ) {
539 print STDERR <<END;
540 $self: $format cannot be found
541 Exception $@
542 For more information about the SeqIO system please see the SeqIO docs.
543 This includes ways of checking for formats at compile time, not run time
544 END
545 ;
546 }
547 return $ok;
548 }
549
550 =head2 _concatenate_lines
551
552 Title : _concatenate_lines
553 Usage : $s = _concatenate_lines($line, $continuation_line)
554 Function: Private. Concatenates two strings assuming that the second stems
555 from a continuation line of the first. Adds a space between both
556 unless the first ends with a dash.
557
558 Takes care of either arg being empty.
559 Example :
560 Returns : A string.
561 Args :
562
563 =cut
564
565 sub _concatenate_lines {
566 my ($self, $s1, $s2) = @_;
567
568 $s1 .= " " if($s1 && ($s1 !~ /-$/) && $s2);
569 return ($s1 ? $s1 : "") . ($s2 ? $s2 : "");
570 }
571
572 =head2 _filehandle
573
574 Title : _filehandle
575 Usage : $obj->_filehandle($newval)
576 Function: This method is deprecated. Call _fh() instead.
577 Example :
578 Returns : value of _filehandle
579 Args : newvalue (optional)
580
581
582 =cut
583
584 sub _filehandle {
585 my ($self,@args) = @_;
586 return $self->_fh(@args);
587 }
588
589 =head2 _guess_format
590
591 Title : _guess_format
592 Usage : $obj->_guess_format($filename)
593 Function: guess format based on file suffix
594 Example :
595 Returns : guessed format of filename (lower case)
596 Args :
597 Notes : formats that _filehandle() will guess include fasta,
598 genbank, scf, pir, embl, raw, gcg, ace, bsml, swissprot,
599 fastq and phd/phred
600
601 =cut
602
603 sub _guess_format {
604 my $class = shift;
605 return unless $_ = shift;
606 return 'fasta' if /\.(fasta|fast|seq|fa|fsa|nt|aa)$/i;
607 return 'genbank' if /\.(gb|gbank|genbank|gbk|gbs)$/i;
608 return 'scf' if /\.scf$/i;
609 return 'scf' if /\.scf$/i;
610 return 'abi' if /\.abi$/i;
611 return 'alf' if /\.alf$/i;
612 return 'ctf' if /\.ctf$/i;
613 return 'ztr' if /\.ztr$/i;
614 return 'pln' if /\.pln$/i;
615 return 'exp' if /\.exp$/i;
616 return 'pir' if /\.pir$/i;
617 return 'embl' if /\.(embl|ebl|emb|dat)$/i;
618 return 'raw' if /\.(txt)$/i;
619 return 'gcg' if /\.gcg$/i;
620 return 'ace' if /\.ace$/i;
621 return 'bsml' if /\.(bsm|bsml)$/i;
622 return 'swiss' if /\.(swiss|sp)$/i;
623 return 'phd' if /\.(phd|phred)$/i;
624 return 'fastq' if /\.fastq$/i;
625 }
626
627 sub DESTROY {
628 my $self = shift;
629
630 $self->close();
631 }
632
633 sub TIEHANDLE {
634 my ($class,$val) = @_;
635 return bless {'seqio' => $val}, $class;
636 }
637
638 sub READLINE {
639 my $self = shift;
640 return $self->{'seqio'}->next_seq() unless wantarray;
641 my (@list, $obj);
642 push @list, $obj while $obj = $self->{'seqio'}->next_seq();
643 return @list;
644 }
645
646 sub PRINT {
647 my $self = shift;
648 $self->{'seqio'}->write_seq(@_);
649 }
650
651 =head2 sequence_factory
652
653 Title : sequence_factory
654 Usage : $seqio->sequence_factory($seqfactory)
655 Function: Get/Set the Bio::Factory::SequenceFactoryI
656 Returns : Bio::Factory::SequenceFactoryI
657 Args : [optional] Bio::Factory::SequenceFactoryI
658
659
660 =cut
661
662 sub sequence_factory{
663 my ($self,$obj) = @_;
664 if( defined $obj ) {
665 if( ! ref($obj) || ! $obj->isa('Bio::Factory::SequenceFactoryI') ) {
666 $self->throw("Must provide a valid Bio::Factory::SequenceFactoryI object to ".ref($self)."::sequence_factory()");
667 }
668 $self->{'_seqio_seqfactory'} = $obj;
669 my $builder = $self->sequence_builder();
670 if($builder && $builder->can('sequence_factory') &&
671 (! $builder->sequence_factory())) {
672 $builder->sequence_factory($obj);
673 }
674 }
675 $self->{'_seqio_seqfactory'};
676 }
677
678 =head2 object_factory
679
680 Title : object_factory
681 Usage : $obj->object_factory($newval)
682 Function: This is an alias to sequence_factory with a more generic name.
683 Example :
684 Returns : value of object_factory (a scalar)
685 Args : on set, new value (a scalar or undef, optional)
686
687
688 =cut
689
690 sub object_factory{
691 return shift->sequence_factory(@_);
692 }
693
694 =head2 sequence_builder
695
696 Title : sequence_builder
697 Usage : $seqio->sequence_builder($seqfactory)
698 Function: Get/Set the L<Bio::Factory::ObjectBuilderI> used to build sequence
699 objects.
700
701 If you do not set the sequence object builder yourself, it
702 will in fact be an instance of L<Bio::Seq::SeqBuilder>, and
703 you may use all methods documented there to configure it.
704
705 Returns : a L<Bio::Factory::ObjectBuilderI> compliant object
706 Args : [optional] a L<Bio::Factory::ObjectBuilderI> compliant object
707
708
709 =cut
710
711 sub sequence_builder{
712 my ($self,$obj) = @_;
713 if( defined $obj ) {
714 if( ! ref($obj) || ! $obj->isa('Bio::Factory::ObjectBuilderI') ) {
715 $self->throw("Must provide a valid Bio::Factory::ObjectBuilderI object to ".ref($self)."::sequence_builder()");
716 }
717 $self->{'_object_builder'} = $obj;
718 }
719 $self->{'_object_builder'};
720 }
721
722 =head2 location_factory
723
724 Title : location_factory
725 Usage : $seqio->location_factory($locfactory)
726 Function: Get/Set the Bio::Factory::LocationFactoryI object to be used for
727 location string parsing
728 Returns : a L<Bio::Factory::LocationFactoryI> implementing object
729 Args : [optional] on set, a L<Bio::Factory::LocationFactoryI> implementing
730 object.
731
732
733 =cut
734
735 sub location_factory{
736 my ($self,$obj) = @_;
737 if( defined $obj ) {
738 if( ! ref($obj) || ! $obj->isa('Bio::Factory::LocationFactoryI') ) {
739 $self->throw("Must provide a valid Bio::Factory::LocationFactoryI".
740 " object to ".ref($self)."->location_factory()");
741 }
742 $self->{'_seqio_locfactory'} = $obj;
743 }
744 $self->{'_seqio_locfactory'};
745 }
746
747 1;
748