0
|
1 # $Id: fasta.pm,v 1.41.2.4 2003/09/18 02:43:16 jason Exp $
|
|
2 # BioPerl module for Bio::SeqIO::fasta
|
|
3 #
|
|
4 # Cared for by Ewan Birney <birney@ebi.ac.uk>
|
|
5 # and Lincoln Stein <lstein@cshl.org>
|
|
6 #
|
|
7 # Copyright Ewan Birney & Lincoln Stein
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10 # _history
|
|
11 # October 18, 1999 Largely rewritten by Lincoln Stein
|
|
12
|
|
13 # POD documentation - main docs before the code
|
|
14
|
|
15 =head1 NAME
|
|
16
|
|
17 Bio::SeqIO::fasta - fasta sequence input/output stream
|
|
18
|
|
19 =head1 SYNOPSIS
|
|
20
|
|
21 Do not use this module directly. Use it via the Bio::SeqIO class.
|
|
22
|
|
23 =head1 DESCRIPTION
|
|
24
|
|
25 This object can transform Bio::Seq objects to and from fasta flat
|
|
26 file databases.
|
|
27
|
|
28 A method L<preferred_id_type()> can be used to specify the type of ID
|
|
29 we would like to parse from the fasta line. By default 'display' is
|
|
30 used, which means it parses everything from the '>' to the first space
|
|
31 and makes that the 'display_id' for the sequence.
|
|
32
|
|
33 Can be one of:
|
|
34 - accession
|
|
35 - accession.version
|
|
36 - display
|
|
37 - primary
|
|
38
|
|
39 =head1 FEEDBACK
|
|
40
|
|
41 =head2 Mailing Lists
|
|
42
|
|
43 User feedback is an integral part of the evolution of this and other
|
|
44 Bioperl modules. Send your comments and suggestions preferably to one
|
|
45 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
46
|
|
47 bioperl-l@bioperl.org - General discussion
|
|
48 http://bioperl.org/MailList.shtml - About the mailing lists
|
|
49
|
|
50 =head2 Reporting Bugs
|
|
51
|
|
52 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
53 the bugs and their resolution. Bug reports can be submitted via the
|
|
54 web:
|
|
55
|
|
56 http://bugzilla.bioperl.org/
|
|
57
|
|
58 =head1 AUTHORS - Ewan Birney & Lincoln Stein
|
|
59
|
|
60 Email: birney@ebi.ac.uk
|
|
61 lstein@cshl.org
|
|
62
|
|
63 =head1 CONTRIBUTORS
|
|
64
|
|
65 Jason Stajich, jason-at-bioperl.org
|
|
66
|
|
67 =head1 APPENDIX
|
|
68
|
|
69 The rest of the documentation details each of the object
|
|
70 methods. Internal methods are usually preceded with a _
|
|
71
|
|
72 =cut
|
|
73
|
|
74 # Let the code begin...
|
|
75
|
|
76 package Bio::SeqIO::fasta;
|
|
77 use vars qw(@ISA $WIDTH @SEQ_ID_TYPES $DEFAULT_SEQ_ID_TYPE);
|
|
78 use strict;
|
|
79 # Object preamble - inherits from Bio::Root::Object
|
|
80
|
|
81 use Bio::SeqIO;
|
|
82 use Bio::Seq::SeqFactory;
|
|
83 use Bio::Seq::SeqFastaSpeedFactory;
|
|
84
|
|
85 @ISA = qw(Bio::SeqIO);
|
|
86
|
|
87 @SEQ_ID_TYPES = qw(accession accession.version display primary);
|
|
88 $DEFAULT_SEQ_ID_TYPE = 'display';
|
|
89
|
|
90 BEGIN { $WIDTH = 60}
|
|
91
|
|
92 sub _initialize {
|
|
93 my($self,@args) = @_;
|
|
94 $self->SUPER::_initialize(@args);
|
|
95 my ($width) = $self->_rearrange([qw(WIDTH)], @args);
|
|
96 $width && $self->width($width);
|
|
97 unless ( defined $self->sequence_factory ) {
|
|
98 $self->sequence_factory(Bio::Seq::SeqFastaSpeedFactory->new());
|
|
99 }
|
|
100 }
|
|
101
|
|
102 =head2 next_seq
|
|
103
|
|
104 Title : next_seq
|
|
105 Usage : $seq = $stream->next_seq()
|
|
106 Function: returns the next sequence in the stream
|
|
107 Returns : Bio::Seq object
|
|
108 Args : NONE
|
|
109
|
|
110 =cut
|
|
111
|
|
112 sub next_seq {
|
|
113 my( $self ) = @_;
|
|
114 my $seq;
|
|
115 my $alphabet;
|
|
116 local $/ = "\n>";
|
|
117 return unless my $entry = $self->_readline;
|
|
118
|
|
119 chomp($entry);
|
|
120 if ($entry =~ m/\A\s*\Z/s) { # very first one
|
|
121 return unless $entry = $self->_readline;
|
|
122 chomp($entry);
|
|
123 }
|
|
124 $entry =~ s/^>//;
|
|
125
|
|
126 my ($top,$sequence) = split(/\n/,$entry,2);
|
|
127 defined $sequence && $sequence =~ s/>//g;
|
|
128 # my ($top,$sequence) = $entry =~ /^>?(.+?)\n+([^>]*)/s
|
|
129 # or $self->throw("Can't parse fasta entry");
|
|
130
|
|
131 my ($id,$fulldesc);
|
|
132 if( $top =~ /^\s*(\S+)\s*(.*)/ ) {
|
|
133 ($id,$fulldesc) = ($1,$2);
|
|
134 }
|
|
135
|
|
136 if (defined $id && $id eq '') {$id=$fulldesc;} # FIX incase no space
|
|
137 # between > and name \AE
|
|
138 defined $sequence && $sequence =~ s/\s//g; # Remove whitespace
|
|
139
|
|
140 # for empty sequences we need to know the mol.type
|
|
141 $alphabet = $self->alphabet();
|
|
142 if(defined $sequence && length($sequence) == 0) {
|
|
143 if(! defined($alphabet)) {
|
|
144 # let's default to dna
|
|
145 $alphabet = "dna";
|
|
146 }
|
|
147 } else {
|
|
148 # we don't need it really, so disable
|
|
149 $alphabet = undef;
|
|
150 }
|
|
151
|
|
152 $seq = $self->sequence_factory->create(
|
|
153 -seq => $sequence,
|
|
154 -id => $id,
|
|
155 # Ewan's note - I don't think this healthy
|
|
156 # but obviously to taste.
|
|
157 #-primary_id => $id,
|
|
158 -desc => $fulldesc,
|
|
159 -alphabet => $alphabet,
|
|
160 -direct => 1,
|
|
161 );
|
|
162
|
|
163
|
|
164
|
|
165
|
|
166 # if there wasn't one before, set the guessed type
|
|
167 unless ( defined $alphabet ) {
|
|
168 $self->alphabet($seq->alphabet());
|
|
169 }
|
|
170 return $seq;
|
|
171
|
|
172 }
|
|
173
|
|
174 =head2 write_seq
|
|
175
|
|
176 Title : write_seq
|
|
177 Usage : $stream->write_seq(@seq)
|
|
178 Function: writes the $seq object into the stream
|
|
179 Returns : 1 for success and 0 for error
|
|
180 Args : array of 1 to n Bio::PrimarySeqI objects
|
|
181
|
|
182
|
|
183 =cut
|
|
184
|
|
185 sub write_seq {
|
|
186 my ($self,@seq) = @_;
|
|
187 my $width = $self->width;
|
|
188 foreach my $seq (@seq) {
|
|
189 $self->throw("Did not provide a valid Bio::PrimarySeqI object")
|
|
190 unless defined $seq && ref($seq) && $seq->isa('Bio::PrimarySeqI');
|
|
191
|
|
192 my $str = $seq->seq;
|
|
193 my $top;
|
|
194
|
|
195 # Allow for different ids
|
|
196 my $id_type = $self->preferred_id_type;
|
|
197 if( $id_type =~ /^acc/i ) {
|
|
198 $top = $seq->accession_number();
|
|
199 if( $id_type =~ /vers/i ) {
|
|
200 $top .= "." . $seq->version();
|
|
201 }
|
|
202 } elsif($id_type =~ /^displ/i ) {
|
|
203 $top = $seq->display_id();
|
|
204 } elsif($id_type =~ /^pri/i ) {
|
|
205 $top = $seq->primary_id();
|
|
206 }
|
|
207
|
|
208 if ($seq->can('desc') and my $desc = $seq->desc()) {
|
|
209 $desc =~ s/\n//g;
|
|
210 $top .= " $desc";
|
|
211 }
|
|
212 if(length($str) > 0) {
|
|
213 $str =~ s/(.{1,$width})/$1\n/g;
|
|
214 } else {
|
|
215 $str = "\n";
|
|
216 }
|
|
217 $self->_print (">",$top,"\n",$str) or return;
|
|
218 }
|
|
219
|
|
220 $self->flush if $self->_flush_on_write && defined $self->_fh;
|
|
221 return 1;
|
|
222 }
|
|
223
|
|
224 =head2 width
|
|
225
|
|
226 Title : width
|
|
227 Usage : $obj->width($newval)
|
|
228 Function: Get/Set the line width for FASTA output
|
|
229 Returns : value of width
|
|
230 Args : newvalue (optional)
|
|
231
|
|
232
|
|
233 =cut
|
|
234
|
|
235 sub width{
|
|
236 my ($self,$value) = @_;
|
|
237 if( defined $value) {
|
|
238 $self->{'width'} = $value;
|
|
239 }
|
|
240 return $self->{'width'} || $WIDTH;
|
|
241 }
|
|
242
|
|
243 =head2 preferred_id_type
|
|
244
|
|
245 Title : preferred_id_type
|
|
246 Usage : $obj->preferred_id_type('accession')
|
|
247 Function: Get/Set the preferred type of identifier to use in the ">ID" position
|
|
248 for FASTA output.
|
|
249 Returns : string, one of values defined in @Bio::SeqIO::fasta::SEQ_ID_TYPES.
|
|
250 Default = $Bio::SeqIO::fasta::DEFAULT_SEQ_ID_TYPE ('display').
|
|
251 Args : string when setting. This must be one of values defined in
|
|
252 @Bio::SeqIO::fasta::SEQ_ID_TYPES. Allowable values:
|
|
253 accession, accession.version, display, primary
|
|
254 Throws : fatal exception if the supplied id type is not in @SEQ_ID_TYPES.
|
|
255
|
|
256 =cut
|
|
257
|
|
258 sub preferred_id_type {
|
|
259 my ($self,$type) = @_;
|
|
260 if( defined $type ) {
|
|
261 if( ! grep lc($type) eq $_, @SEQ_ID_TYPES) {
|
|
262 $self->throw(-class=>'Bio::Root::BadParameter',
|
|
263 -text=>"Invalid ID type \"$type\". Must be one of: @SEQ_ID_TYPES");
|
|
264 }
|
|
265 $self->{'_seq_id_type'} = lc($type);
|
|
266 # print STDERR "Setting preferred_id_type=$type\n";
|
|
267 }
|
|
268 $self->{'_seq_id_type'} || $DEFAULT_SEQ_ID_TYPE;
|
|
269 }
|
|
270
|
|
271 1;
|