0
|
1 # $Id: largefasta.pm,v 1.18 2002/12/27 19:42:32 birney Exp $
|
|
2 # BioPerl module for Bio::SeqIO::largefasta
|
|
3 #
|
|
4 # Cared for by Jason Stajich
|
|
5 #
|
|
6 # Copyright Jason Stajich
|
|
7 #
|
|
8 # You may distribute this module under the same terms as perl itself
|
|
9 # _history
|
|
10 #
|
|
11 # POD documentation - main docs before the code
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 Bio::SeqIO::largefasta - method i/o on very large fasta sequence files
|
|
16
|
|
17 =head1 SYNOPSIS
|
|
18
|
|
19 Do not use this module directly. Use it via the Bio::SeqIO class.
|
|
20
|
|
21 =head1 DESCRIPTION
|
|
22
|
|
23 This object can transform Bio::Seq objects to and from fasta flat
|
|
24 file databases.
|
|
25
|
|
26 This module handles very large sequence files by using the
|
|
27 Bio::Seq::LargePrimarySeq module to store all the sequence data in
|
|
28 a file. This can be a problem if you have limited disk space on your
|
|
29 computer because this will effectively cause 2 copies of the sequence
|
|
30 file to reside on disk for the life of the
|
|
31 Bio::Seq::LargePrimarySeq object. The default location for this is
|
|
32 specified by the L<File::Spec>-E<gt>tmpdir routine which is usually /tmp
|
|
33 on UNIX. If a sequence file is larger than the swap space (capacity
|
|
34 of the /tmp dir) this could cause problems for the machine. It is
|
|
35 possible to set the directory where the temporary file is located by
|
|
36 adding the following line to your code BEFORE calling next_seq. See
|
|
37 L<Bio::Seq::LargePrimarySeq> for more information.
|
|
38
|
|
39 $Bio::Seq::LargePrimarySeq::DEFAULT_TEMP_DIR = 'newdir';
|
|
40
|
|
41 =head1 FEEDBACK
|
|
42
|
|
43 =head2 Mailing Lists
|
|
44
|
|
45 User feedback is an integral part of the evolution of this and other
|
|
46 Bioperl modules. Send your comments and suggestions preferably to one
|
|
47 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
48
|
|
49 bioperl-l@bioperl.org - General discussion
|
|
50 http://bioperl.org/MailList.shtml - About the mailing lists
|
|
51
|
|
52 =head2 Reporting Bugs
|
|
53
|
|
54 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
55 of the bugs and their resolution. Bug reports can be submitted via email
|
|
56 or the web:
|
|
57
|
|
58 bioperl-bugs@bio.perl.org
|
|
59 http://bugzilla.bioperl.org/
|
|
60
|
|
61 =head1 AUTHORS - Jason Stajich
|
|
62
|
|
63 Email: jason@bioperl.org
|
|
64
|
|
65 =head1 APPENDIX
|
|
66
|
|
67 The rest of the documentation details each of the object
|
|
68 methods. Internal methods are usually preceded with a _
|
|
69
|
|
70 =cut
|
|
71
|
|
72 # Let the code begin...
|
|
73
|
|
74 package Bio::SeqIO::largefasta;
|
|
75 use vars qw(@ISA $FASTALINELEN);
|
|
76 use strict;
|
|
77 # Object preamble - inherits from Bio::SeqIO
|
|
78
|
|
79 use Bio::SeqIO;
|
|
80 use Bio::Seq::SeqFactory;
|
|
81
|
|
82 $FASTALINELEN = 60; # number of residues write_seq puts on each sequence line
|
|
83 @ISA = qw(Bio::SeqIO); # this class is a subclass of Bio::SeqIO
|
|
84
|
|
# Object initializer, invoked as $self->_initialize(@args).
#
# Passes @args unchanged to the parent class initializer, then makes sure
# the stream has a sequence factory: when none is defined yet, one is
# installed that builds Bio::Seq::LargePrimarySeq objects and shares this
# stream's verbosity. A factory that is already defined is left untouched.
sub _initialize {
    my ($self, @args) = @_;
    $self->SUPER::_initialize(@args);

    if ( !defined $self->sequence_factory ) {
        # Direct class-method call instead of the parse-ambiguous
        # indirect-object form "new Bio::Seq::SeqFactory (...)".
        $self->sequence_factory(
            Bio::Seq::SeqFactory->new(
                -verbose => $self->verbose(),
                -type    => 'Bio::Seq::LargePrimarySeq',
            )
        );
    }
}
|
|
94
|
|
95 =head2 next_seq
|
|
96
|
|
97 Title : next_seq
|
|
98 Usage : $seq = $stream->next_seq()
|
|
99 Function: returns the next sequence in the stream
|
|
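100 Returns : sequence object built by the sequence factory (Bio::Seq::LargePrimarySeq by default), or undef when no further record is found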
|
|
101 Args : NONE
|
|
102
|
|
103 =cut
|
|
104
|
|
# Read the next FASTA record from the stream.
#
# Usage   : my $seq = $stream->next_seq();
# Returns : the object produced by the stream's sequence factory, filled
#           with the record's display_id, primary_id, desc and sequence
#           data; undef when no header line is found before the input
#           is exhausted.
#
# Lines are pulled with _readline. The first line starting with '>'
# (optionally preceded by whitespace) opens the record; the next such line
# belongs to the following record, so it is pushed back and the current
# object is returned. Every other line has all whitespace removed and is
# appended to the sequence object.
sub next_seq {
    my ($self) = @_;

    my $largeseq = $self->sequence_factory->create();
    my $count    = 0;    # lines consumed, used only for progress output
    my $seen     = 0;    # true once this record's header has been read

    while ( defined( my $entry = $self->_readline ) ) {
        if ( $entry =~ /^\s*>/ ) {
            if ($seen) {
                # Start of the following record: leave it for the next call.
                $self->_pushback($entry);
                return $largeseq;
            }
            $seen = 1;

            # First token after '>' is the id, the remainder of the line is
            # the description. A bare '>' (nothing but whitespace after it)
            # still opens a record; it simply carries no id or description.
            if ( $entry =~ /^\s*>\s*(\S+)\s*(.*)$/ ) {
                my ( $id, $fulldesc ) = ( $1, $2 );
                $largeseq->display_id($id);
                $largeseq->primary_id($id);
                $largeseq->desc($fulldesc);
            }
        }
        else {
            # Sequence data: drop line endings and any embedded whitespace.
            $entry =~ s/\s+//g;
            $largeseq->add_sequence_as_string($entry);
        }

        # Progress report every 1000 lines when running verbosely.
        if ( ++$count % 1000 == 0 && $self->verbose() > 0 ) {
            print "line $count\n";
        }
    }

    # Explicit undef (not a bare return) is kept so that callers using this
    # method in list context still receive a single value, as before.
    if ( !$seen ) { return undef; }
    return $largeseq;
}
|
|
135
|
|
136 =head2 write_seq
|
|
137
|
|
138 Title : write_seq
|
|
139 Usage : $stream->write_seq(@seq)
|
|
140 Function: writes the $seq object into the stream
|
|
141 Returns : 1 for success and 0 for error
|
|
142 Args : Bio::Seq object
|
|
143
|
|
144
|
|
145 =cut
|
|
146
|
|
# Write one or more sequence objects to the stream in FASTA format.
#
# Usage   : $stream->write_seq(@seq);
# Args    : objects providing id(), length() and subseq($start,$stop);
#           desc() is used when the object implements it.
# Returns : 1
#
# Each record is a ">id description" header (newlines removed from the
# description) followed by the sequence in lines of $FASTALINELEN residues.
# The sequence is fetched chunk by chunk through subseq() and never
# requested as a single string.
sub write_seq {
    my ( $self, @seq ) = @_;

    # Guard against an unset or non-positive line width, which would keep
    # the chunk loop from ever advancing.
    my $width = $FASTALINELEN;
    $width = 60 unless defined $width && $width >= 1;

    foreach my $seq (@seq) {
        my $top = $seq->id();
        if ( $seq->can('desc') and my $desc = $seq->desc() ) {
            $desc =~ s/\n//g;
            $top .= " $desc";
        }
        $self->_print( ">", $top, "\n" );

        my $end = $seq->length();

        # Coordinates are 1-based and inclusive, so the loop must also run
        # when $start == $end; otherwise a final one-residue chunk
        # (lengths 1, 61, 121, ...) is silently dropped.
        for ( my $start = 1 ; $start <= $end ; $start += $width ) {
            my $stop = $start + $width - 1;
            $stop = $end if $stop > $end;
            $self->_print( $seq->subseq( $start, $stop ), "\n" );
        }
    }

    $self->flush if $self->_flush_on_write && defined $self->_fh;
    return 1;
}
|
|
169
|
|
170 1;
|