Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Structure/IO.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: IO.pm,v 1.3 2002/10/22 07:45:21 lapp Exp $ | |
2 # | |
3 # BioPerl module for Bio::Structure::IO | |
4 # | |
5 # Cared for by Ewan Birney <birney@sanger.ac.uk> | |
6 # and Lincoln Stein <lstein@cshl.org> | |
7 # and Kris Boulez <kris.boulez@algonomics.com> | |
8 # | |
9 # Copyright 2001, 2002 Kris Boulez | |
10 # | |
11 # You may distribute this module under the same terms as perl itself | |
12 # | |
13 # _history | |
14 # October 18, 1999 Largely rewritten by Lincoln Stein | |
15 # November 16, 2001 Copied Bio::SeqIO to Bio::Structure::IO and modified | |
16 # where needed. Factoring out common methods | |
17 # (to Bio::Root::IO) might be a good idea. | |
18 | |
19 # POD documentation - main docs before the code | |
20 | |
21 =head1 NAME | |
22 | |
23 Bio::Structure::IO - Handler for Structure Formats | |
24 | |
25 =head1 SYNOPSIS | |
26 | |
27 use Bio::Structure::IO; | |
28 | |
29 $in = Bio::Structure::IO->new(-file => "inputfilename" , '-format' => 'pdb'); | |
30 $out = Bio::Structure::IO->new(-file => ">outputfilename" , '-format' => 'pdb'); | |
31 # note: we quote -format to keep older perl's from complaining. | |
32 | |
33 while ( my $struc = $in->next_structure() ) { | |
34 $out->write_structure($struc); | |
35 } | |
36 | |
37 now, to actually get at the structure object, use the standard Bio::Structure | |
38 methods (look at L<Bio::Structure> if you don't know what they are) | |
39 | |
40 use Bio::Structure::IO; | |
41 | |
42 $in = Bio::Structure::IO->new(-file => "inputfilename" , '-format' => 'pdb'); | |
43 | |
44 while ( my $struc = $in->next_structure() ) { | |
45 print "Structure ",$struc->id," number of models: ",scalar $struc->model,"\n"; | |
46 } | |
47 | |
48 | |
49 | |
50 =head1 DESCRIPTION | |
51 | |
52 [ The following description is a copy-paste from the Bio::SeqIO description. | |
53 This is not surprising as the code is also mostly a copy. ] | |
54 | |
55 Bio::Structure::IO is a handler module for the formats in the Structure::IO set | |
56 (eg, Bio::Structure::IO::pdb). It is the officially sanctioned way of getting at | |
57 the format objects, which most people should use. | |
58 | |
59 The Bio::Structure::IO system can be thought of like biological file handles. | |
60 They are attached to filehandles with smart formatting rules (eg, PDB format) | |
61 and can either read or write structure objects (Bio::Structure objects, or | |
62 more correctly, Bio::Structure::StructureI implementing objects, of which | |
63 Bio::Structure is one such object). If you want to know what to do with a | |
64 Bio::Structure object, read L<Bio::Structure> | |
65 | |
66 The idea is that you request a stream object for a particular format. | |
67 All the stream objects have a notion of an internal file that is read | |
68 from or written to. A particular Structure::IO object instance is configured | |
69 for either input or output. A specific example of a stream object is | |
70 the Bio::Structure::IO::pdb object. | |
71 | |
72 Each stream object has functions | |
73 | |
74 $stream->next_structure(); | |
75 | |
76 and | |
77 | |
78 $stream->write_structure($struc); | |
79 | |
80 also | |
81 | |
82 $stream->type() # returns 'INPUT' or 'OUTPUT' | |
83 | |
84 As an added bonus, you can recover a filehandle that is tied to the | |
85 Structure::IO object, allowing you to use the standard E<lt>E<gt> and print operations | |
86 to read and write structure objects: | |
87 | |
88 use Bio::Structure::IO; | |
89 | |
90 $stream = Bio::Structure::IO->newFh(-format => 'pdb'); # read from standard input | |
91 | |
92 while ( $structure = <$stream> ) { | |
93 # do something with $structure | |
94 } | |
95 | |
96 and | |
97 | |
98 print $stream $structure; # when stream is in output mode | |
99 | |
100 | |
101 =head1 CONSTRUCTORS | |
102 | |
103 =head2 Bio::Structure::IO-E<gt>new() | |
104 | |
105 $stream = Bio::Structure::IO->new(-file => 'filename', -format=>$format); | |
106 $stream = Bio::Structure::IO->new(-fh => \*FILEHANDLE, -format=>$format); | |
107 $stream = Bio::Structure::IO->new(-format => $format); | |
108 | |
109 The new() class method constructs a new Bio::Structure::IO object. The | |
110 returned object can be used to retrieve or print Bio::Structure objects. | |
111 new() accepts the following parameters: | |
112 | |
113 =over 4 | |
114 | |
115 =item -file | |
116 | |
117 A file path to be opened for reading or writing. The usual Perl | |
118 conventions apply: | |
119 | |
120 'file' # open file for reading | |
121 '>file' # open file for writing | |
122 '>>file' # open file for appending | |
123 '+<file' # open file read/write | |
124 'command |' # open a pipe from the command | |
125 '| command' # open a pipe to the command | |
126 | |
127 =item -fh | |
128 | |
129 You may provide new() with a previously-opened filehandle. For | |
130 example, to read from STDIN: | |
131 | |
132 $strucIO = Bio::Structure::IO->new(-fh => \*STDIN); | |
133 | |
134 Note that you must pass filehandles as references to globs. | |
135 | |
136 If neither a filehandle nor a filename is specified, then the module | |
137 will read from the @ARGV array or STDIN, using the familiar E<lt>E<gt> | |
138 semantics. | |
139 | |
140 A string filehandle is handy if you want to modify the output in the | |
141 memory, before printing it out. The following program reads in EMBL | |
142 formatted entries from a file and prints them out in fasta format with | |
143 some HTML tags: | |
144 [ not relevant for Bio::Structure::IO as only one format is supported | |
145 at the moment ] | |
146 | |
147 use Bio::SeqIO; | |
148 use IO::String; | |
149 my $in = Bio::SeqIO->new('-file' => "emblfile" , | |
150 '-format' => 'EMBL'); | |
151 while ( my $seq = $in->next_seq() ) { | |
152 # the output handle is reset for every file | |
153 my $stringio = IO::String->new($string); | |
154 my $out = Bio::SeqIO->new('-fh' => $stringio, | |
155 '-format' => 'fasta'); | |
156 # output goes into $string | |
157 $out->write_seq($seq); | |
158 # modify $string | |
159 $string =~ s|(>)(\w+)|$1<font color="Red">$2</font>|g; | |
160 # print into STDOUT | |
161 print $string; | |
162 } | |
163 | |
164 =item -format | |
165 | |
166 Specify the format of the file. Supported formats include: | |
167 | |
168 PDB Protein Data Bank format | |
169 | |
170 If no format is specified and a filename is given, then the module | |
171 will attempt to deduce it from the filename. If this is unsuccessful, | |
172 PDB format is assumed. | |
173 | |
174 The format name is case insensitive. 'PDB', 'Pdb' and 'pdb' are | |
175 all supported. | |
176 | |
177 =back | |
178 | |
179 =head2 Bio::Structure::IO-E<gt>newFh() | |
180 | |
181 $fh = Bio::Structure::IO->newFh(-fh => \*FILEHANDLE, -format=>$format); | |
182 $fh = Bio::Structure::IO->newFh(-format => $format); | |
183 # etc. | |
184 | |
185 This constructor behaves like new(), but returns a tied filehandle | |
186 rather than a Bio::Structure::IO object. You can read structures from this | |
187 object using the familiar E<lt>E<gt> operator, and write to it using | |
188 print(). The usual array and $_ semantics work. For example, you can | |
189 read all structure objects into an array like this: | |
190 | |
191 @structures = <$fh>; | |
192 | |
193 Other operations, such as read(), sysread(), write(), close(), and printf() | |
194 are not supported. | |
195 | |
196 =head1 OBJECT METHODS | |
197 | |
198 See below for more detailed summaries. The main methods are: | |
199 | |
200 =head2 $structure = $structIO-E<gt>next_structure() | |
201 | |
202 Fetch the next structure from the stream. | |
203 | |
204 =head2 $structIO-E<gt>write_structure($struc [,$another_struc,...]) | |
205 | |
206 Write the specified structure(s) to the stream. | |
207 | |
208 =head2 TIEHANDLE(), READLINE(), PRINT() | |
209 | |
210 These provide the tie interface. See L<perltie> for more details. | |
211 | |
212 =head1 FEEDBACK | |
213 | |
214 =head2 Mailing Lists | |
215 | |
216 User feedback is an integral part of the evolution of this | |
217 and other Bioperl modules. Send your comments and suggestions preferably | |
218 to one of the Bioperl mailing lists. | |
219 Your participation is much appreciated. | |
220 | |
221 bioperl-l@bioperl.org - General discussion | |
222 http://bioperl.org/MailList.shtml - About the mailing lists | |
223 | |
224 =head2 Reporting Bugs | |
225 | |
226 Report bugs to the Bioperl bug tracking system to help us keep track | |
227 of the bugs and their resolution. | |
228 Bug reports can be submitted via email or the web: | |
229 | |
230 bioperl-bugs@bioperl.org | |
231 http://bugzilla.bioperl.org/ | |
232 | |
233 =head1 AUTHOR - Ewan Birney, Lincoln Stein, Kris Boulez | |
234 | |
235 Email birney@ebi.ac.uk, kris.boulez@algonomics.com | |
236 | |
237 Describe contact details here | |
238 | |
239 =head1 APPENDIX | |
240 | |
241 The rest of the documentation details each of the object | |
242 methods. Internal methods are usually preceded with a _ | |
243 | |
244 =cut | |
245 | |
246 # Let the code begin... | |
247 | |
248 package Bio::Structure::IO; | |
249 | |
250 use strict; | |
251 use vars qw(@ISA); | |
252 | |
253 use Bio::Root::Root; | |
254 use Bio::Root::IO; | |
255 use Bio::PrimarySeq; | |
256 use Symbol(); | |
257 | |
258 @ISA = qw(Bio::Root::Root Bio::Root::IO); | |
259 | |
260 =head2 new | |
261 | |
262 Title : new | |
263 Usage : $stream = Bio::Structure::IO->new(-file => $filename, -format => 'Format') | |
264 Function: Returns a new structIOstream | |
265 Returns : A Bio::Structure::IO handler initialised with the appropriate format | |
266 Args : -file => $filename | |
267 -format => format | |
268 -fh => filehandle to attach to | |
269 | |
270 =cut | |
271 | |
272 my $entry = 0; | |
273 | |
# Factory constructor.
#
# Called on a format driver class (a class name matching
# Bio::Structure::IO::<something>) it builds the object through the parent
# constructor and runs _initialize() on it.
#
# Called on any other class it determines the format -- the -format argument,
# else a guess from the file name (-file, or $ARGV[0] when -file is not
# given), else 'pdb' -- loads the matching driver module and passes the
# original, unmodified arguments to that driver's own new().
#
# Args    : named arguments, keys are matched case-insensitively
#             -file   => $filename
#             -format => format name (lower-cased before use)
#             -fh     => filehandle to attach to
# Returns : the stream object, or nothing (undef in scalar context, the
#           empty list in list context) when the driver cannot be loaded
sub new {
    my ($caller, @args) = @_;
    my $class = ref($caller) || $caller;

    # or do we want to call SUPER on an object if $caller is an
    # object?
    if ( $class =~ /Bio::Structure::IO::(\S+)/ ) {
        my ($self) = $class->SUPER::new(@args);
        $self->_initialize(@args);
        return $self;
    }

    my %param = @args;
    @param{ map { lc $_ } keys %param } = values %param;    # lowercase keys

    my $format = $param{'-format'}
        || $class->_guess_format( $param{'-file'} || $ARGV[0] )
        || 'pdb';
    $format = "\L$format";    # normalize capitalization to lower case

    # Bare return (not "return undef") so that a failed load is also false
    # when the caller evaluates the result in list context.
    return unless _load_format_module($format);
    return "Bio::Structure::IO::$format"->new(@args);
}
298 | |
299 =head2 newFh | |
300 | |
301 Title : newFh | |
302 Usage : $fh = Bio::Structure::IO->newFh(-file=>$filename,-format=>'Format') | |
303 Function: does a new() followed by an fh() | |
304 Example : $fh = Bio::Structure::IO->newFh(-file=>$filename,-format=>'Format') | |
305 $structure = <$fh>; # read a structure object | |
306 print $fh $structure; # write a structure object | |
307 Returns : filehandle tied to the Bio::Structure::IO::Fh class | |
308 Args : | |
309 | |
310 =cut | |
311 | |
# Convenience constructor: builds the stream object with new() (passing all
# arguments through) and returns whatever its fh() method yields.  Returns
# nothing when new() does not produce a true value.
sub newFh {
    my ($class, @args) = @_;

    my $stream = $class->new(@args);
    return unless $stream;

    return $stream->fh;
}
317 | |
318 =head2 fh | |
319 | |
320 Title : fh | |
321 Usage : $obj->fh | |
322 Function: | |
323 Example : $fh = $obj->fh; # make a tied filehandle | |
324 $structure = <$fh>; # read a structure object | |
325 print $fh $structure; # write a structure object | |
326 Returns : filehandle tied to the Bio::Structure::IO::Fh class | |
327 Args : | |
328 | |
329 =cut | |
330 | |
331 | |
# Creates a fresh anonymous glob, ties it to this object's class (passing the
# object itself as the tie argument, so TIEHANDLE receives it) and returns the
# glob reference for use as a filehandle.
sub fh {
    my ($self) = @_;

    my $tie_class = ref($self) || $self;
    my $handle    = Symbol::gensym();

    tie ${$handle}, $tie_class, $self;
    return $handle;
}
339 | |
340 | |
341 # _initialize is chained for all Structure::IO classes | |
342 | |
# Chained initialiser: first lets the parent class initialise itself, then
# sets up the IO part via _initialize_io().  Both calls receive the same
# constructor arguments.
sub _initialize {
    my $self = shift;
    my @args = @_;

    # not really necessary unless we put more in RootI
    $self->SUPER::_initialize(@args);

    # initialize the IO part
    return $self->_initialize_io(@args);
}
352 | |
353 =head2 next_structure | |
354 | |
355 Title : next_structure | |
356 Usage : $structure = $stream->next_structure | |
357 Function: Reads the next structure object from the stream and returns it. | |
358 | |
359 Certain driver modules may encounter entries in the stream that | |
360 are either misformatted or that use syntax not yet understood | |
361 by the driver. If such an incident is recoverable, e.g., by | |
362 dismissing a feature of a feature table or some other non-mandatory | |
363 part of an entry, the driver will issue a warning. In the case | |
364 of a non-recoverable situation an exception will be thrown. | |
365 Do not assume that you can resume parsing the same stream after | |
366 catching the exception. Note that you can always turn recoverable | |
367 errors into exceptions by calling $stream->verbose(2) (see | |
368 Bio::RootI POD page). | |
369 Returns : a Bio::Structure structure object | |
370 Args : none | |
371 | |
372 =cut | |
373 | |
# Generic placeholder: reading is only implemented by the format drivers, so
# the base implementation just raises an exception through throw().
sub next_structure {
    my $self = shift;
    $self->throw("Sorry, you cannot read from a generic Bio::Structure::IO object.");
}
378 | |
379 # Do we want people to read out the sequence directly from a $structIO stream | |
380 # | |
381 ##=head2 next_primary_seq | |
382 ## | |
383 ## Title : next_primary_seq | |
384 ## Usage : $seq = $stream->next_primary_seq | |
385 ## Function: Provides a primaryseq type of sequence object | |
386 ## Returns : A Bio::PrimarySeqI object | |
387 ## Args : none | |
388 ## | |
389 ## | |
390 ##=cut | |
391 ## | |
392 ##sub next_primary_seq { | |
393 ## my ($self) = @_; | |
394 ## | |
395 ## # in this case, we default to next_seq. This is because | |
396 ## # Bio::Seq's are Bio::PrimarySeqI objects. However we | |
397 ## # expect certain sub classes to override this method to provide | |
398 ## # less parsing heavy methods to retrieving the objects | |
399 ## | |
400 ## return $self->next_seq(); | |
401 ##} | |
402 | |
403 =head2 write_structure | |
404 | |
405 Title : write_structure | |
406 Usage : $stream->write_structure($structure) | |
407 Function: writes the $structure object into the stream | |
408 Returns : 1 for success and 0 for error | |
409 Args : Bio::Structure object | |
410 | |
411 =cut | |
412 | |
# Generic placeholder for the documented write method: writing is only
# implemented by the format drivers, so the base implementation just raises
# an exception through throw().
sub write_structure {
    my ($self, $struc) = @_;
    $self->throw("Sorry, you cannot write to a generic Bio::Structure::IO object.");
}

# Backward-compatible alias.  This method used to carry the placeholder body
# under the name inherited from Bio::SeqIO; it now forwards all arguments to
# write_structure() so that existing callers reach the driver's real writer.
sub write_seq {
    my ($self, @strucs) = @_;
    return $self->write_structure(@strucs);
}
417 | |
418 | |
419 # Do we need this here? | |
420 # | |
421 ##=head2 alphabet | |
422 ## | |
423 ## Title : alphabet | |
424 ## Usage : $self->alphabet($newval) | |
425 ## Function: Set/get the molecule type for the Seq objects to be created. | |
426 ## Example : $seqio->alphabet('protein') | |
427 ## Returns : value of alphabet: 'dna', 'rna', or 'protein' | |
428 ## Args : newvalue (optional) | |
429 ## Throws : Exception if the argument is not one of 'dna', 'rna', or 'protein' | |
430 ## | |
431 ##=cut | |
432 ## | |
433 ##sub alphabet { | |
434 ## my ($self, $value) = @_; | |
435 ## | |
436 ## if ( defined $value) { | |
437 ## # instead of hard-coding the allowed values once more, we check by | |
438 ## # creating a dummy sequence object | |
439 ## eval { | |
440 ## my $seq = Bio::PrimarySeq->new('-alphabet' => $value); | |
441 ## }; | |
442 ## if($@) { | |
443 ## $self->throw("Invalid alphabet: $value\n. See Bio::PrimarySeq for allowed values."); | |
444 ## } | |
445 ## $self->{'alphabet'} = "\L$value"; | |
446 ## } | |
447 ## return $self->{'alphabet'}; | |
448 ##} | |
449 | |
450 =head2 _load_format_module | |
451 | |
452 Title : _load_format_module | |
453 Usage : *INTERNAL Structure::IO stuff* | |
454 Function: Loads up (like use) a module at run time on demand | |
455 Example : | |
456 Returns : | |
457 Args : | |
458 | |
459 =cut | |
460 | |
# Loads the driver module Bio/Structure/IO/<format>.pm at run time.
#
# This is a plain function (not a method): it takes the format name as its
# only argument.
#
# Args    : $format - driver name; must consist of word characters only,
#                     because it is interpolated into the path handed to
#                     require()
# Returns : 1 when the module is (already) loaded; nothing after printing a
#           diagnostic to STDERR when the name is invalid or require() fails
sub _load_format_module {
    my ($format) = @_;
    $format = '' unless defined $format;

    my $load   = "Bio/Structure/IO/$format.pm";
    my $module = "_<$load";

    # Entry in the main symbol table that perl keeps for a compiled file.
    return 1 if $main::{$module};

    my $error;
    if ( $format !~ /\A\w+\z/ ) {
        # Refuse anything that could make require() walk outside the
        # Bio/Structure/IO directory (path separators, "..", empty name).
        $error = "'$format' is not a valid format name\n";
    }
    elsif ( !eval { require $load; 1 } ) {
        $error = $@;
    }
    return 1 unless defined $error;

    print STDERR <<END;
$load: $format cannot be found
Exception $error
For more information about the Structure::IO system please see the
Bio::Structure::IO docs. This includes ways of checking for formats at
compile time, not run time
END
    return;
}
485 | |
486 =head2 _concatenate_lines | |
487 | |
488 Title : _concatenate_lines | |
489 Usage : $s = _concatenate_lines($line, $continuation_line) | |
490 Function: Private. Concatenates two strings assuming that the second stems | |
491 from a continuation line of the first. Adds a space between both | |
492 unless the first ends with a dash. | |
493 | |
494 Takes care of either arg being empty. | |
495 Example : | |
496 Returns : A string. | |
497 Args : | |
498 | |
499 =cut | |
500 | |
# Joins a line with its continuation line.  A single space is inserted
# between the two unless the first part ends with a dash; a part that is
# false (undef, empty string, "0") contributes nothing.
sub _concatenate_lines {
    my ($self, $s1, $s2) = @_;

    my $head = $s1 ? $s1 : "";
    my $tail = $s2 ? $s2 : "";

    # Separator only when both parts are present and the first one is not
    # hyphenated at its end.
    if ( $s1 && $s2 && $s1 !~ /-$/ ) {
        $head .= " ";
    }

    return $head . $tail;
}
506 | |
507 =head2 _filehandle | |
508 | |
509 Title : _filehandle | |
510 Usage : $obj->_filehandle($newval) | |
511 Function: This method is deprecated. Call _fh() instead. | |
512 Example : | |
513 Returns : value of _filehandle | |
514 Args : newvalue (optional) | |
515 | |
516 | |
517 =cut | |
518 | |
# Deprecated accessor kept for old callers: forwards every argument to _fh()
# and returns its result.
sub _filehandle {
    my $self = shift;
    return $self->_fh(@_);
}
523 | |
524 =head2 _guess_format | |
525 | |
526 Title : _guess_format | |
527 Usage : $obj->_guess_format($filename) | |
528 Function: | |
529 Example : | |
530 Returns : guessed format of filename (lower case) | |
531 Args : | |
532 | |
533 =cut | |
534 | |
# Guesses a format name from a file name's extension (case-insensitive).
#
# Args    : $filename - file name to inspect
# Returns : lower-case format name, or nothing when the file name is false
#           or its extension is not recognised
#
# The file name is held in a lexical rather than assigned to the global $_,
# so the caller's $_ is left untouched (it may be aliased to a read-only
# value, e.g. inside "for ('a.pdb') { ... }").
sub _guess_format {
    my ($class, $filename) = @_;
    return unless $filename;

    return 'fasta'   if $filename =~ /\.(fasta|fast|seq|fa|fsa|nt|aa)$/i;
    return 'genbank' if $filename =~ /\.(gb|gbank|genbank)$/i;
    return 'scf'     if $filename =~ /\.scf$/i;
    return 'pir'     if $filename =~ /\.pir$/i;
    return 'embl'    if $filename =~ /\.(embl|ebl|emb|dat)$/i;
    return 'raw'     if $filename =~ /\.(txt)$/i;
    return 'gcg'     if $filename =~ /\.gcg$/i;
    return 'ace'     if $filename =~ /\.ace$/i;
    return 'bsml'    if $filename =~ /\.(bsm|bsml)$/i;
    return 'pdb'     if $filename =~ /\.(ent|pdb)$/i;

    return;
}
549 | |
# Destructor: calls the object's close() method when the object goes away.
sub DESTROY {
    my ($self) = @_;
    $self->close();
}
555 | |
# Tie constructor: wraps the given value in a hash under the 'structio' key
# and blesses that hash into the requested class.
sub TIEHANDLE {
    my $class    = shift;
    my $structio = shift;

    my %state = ( 'structio' => $structio );
    return bless \%state, $class;
}
560 | |
# Tie interface for <$fh>.
#
# Scalar context : returns the next structure from the wrapped stream.
# List context   : keeps reading until the stream returns a false value and
#                  returns everything read.
#
# Reads through next_structure(), the read method this module defines and
# documents.
sub READLINE {
    my $self   = shift;
    my $stream = $self->{'structio'};

    return $stream->next_structure() unless wantarray;

    my @structures;
    while ( my $structure = $stream->next_structure() ) {
        push @structures, $structure;
    }
    return @structures;
}
568 | |
# Tie interface for "print $fh ...": hands every printed item to the wrapped
# stream's write_structure() method and returns that method's result.
sub PRINT {
    my $self = shift;
    return $self->{'structio'}->write_structure(@_);
}
573 | |
574 1; | |
575 |