Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/SeqIO/fasta.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: fasta.pm,v 1.41.2.4 2003/09/18 02:43:16 jason Exp $ | |
2 # BioPerl module for Bio::SeqIO::fasta | |
3 # | |
4 # Cared for by Ewan Birney <birney@ebi.ac.uk> | |
5 # and Lincoln Stein <lstein@cshl.org> | |
6 # | |
7 # Copyright Ewan Birney & Lincoln Stein | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 # _history | |
11 # October 18, 1999 Largely rewritten by Lincoln Stein | |
12 | |
13 # POD documentation - main docs before the code | |
14 | |
15 =head1 NAME | |
16 | |
17 Bio::SeqIO::fasta - fasta sequence input/output stream | |
18 | |
19 =head1 SYNOPSIS | |
20 | |
21 Do not use this module directly. Use it via the Bio::SeqIO class. | |
22 | |
23 =head1 DESCRIPTION | |
24 | |
25 This object can transform Bio::Seq objects to and from fasta flat | |
26 file databases. | |
27 | |
28 A method L<preferred_id_type()> can be used to specify the type of ID | |
29 we would like to parse from the fasta line. By default 'display' is | |
30 used, which means it parses everything from the '>' to the first space | |
31 and makes that the 'display_id' for the sequence. | |
32 | |
33 Can be one of: | |
34 - accession | |
35 - accession.version | |
36 - display | |
37 - primary | |
38 | |
39 =head1 FEEDBACK | |
40 | |
41 =head2 Mailing Lists | |
42 | |
43 User feedback is an integral part of the evolution of this and other | |
44 Bioperl modules. Send your comments and suggestions preferably to one | |
45 of the Bioperl mailing lists. Your participation is much appreciated. | |
46 | |
47 bioperl-l@bioperl.org - General discussion | |
48 http://bioperl.org/MailList.shtml - About the mailing lists | |
49 | |
50 =head2 Reporting Bugs | |
51 | |
52 Report bugs to the Bioperl bug tracking system to help us keep track | |
53 the bugs and their resolution. Bug reports can be submitted via the | |
54 web: | |
55 | |
56 http://bugzilla.bioperl.org/ | |
57 | |
58 =head1 AUTHORS - Ewan Birney & Lincoln Stein | |
59 | |
60 Email: birney@ebi.ac.uk | |
61 lstein@cshl.org | |
62 | |
63 =head1 CONTRIBUTORS | |
64 | |
65 Jason Stajich, jason-at-bioperl.org | |
66 | |
67 =head1 APPENDIX | |
68 | |
69 The rest of the documentation details each of the object | |
70 methods. Internal methods are usually preceded with a _ | |
71 | |
72 =cut | |
73 | |
74 # Let the code begin... | |
75 | |
76 package Bio::SeqIO::fasta; | |
77 use vars qw(@ISA $WIDTH @SEQ_ID_TYPES $DEFAULT_SEQ_ID_TYPE); | |
78 use strict; | |
79 # Object preamble - inherits from Bio::Root::Object | |
80 | |
81 use Bio::SeqIO; | |
82 use Bio::Seq::SeqFactory; | |
83 use Bio::Seq::SeqFastaSpeedFactory; | |
84 | |
85 @ISA = qw(Bio::SeqIO); | |
86 | |
87 @SEQ_ID_TYPES = qw(accession accession.version display primary); | |
88 $DEFAULT_SEQ_ID_TYPE = 'display'; | |
89 | |
90 BEGIN { $WIDTH = 60} | |
91 | |
92 sub _initialize { | |
93 my($self,@args) = @_; | |
94 $self->SUPER::_initialize(@args); | |
95 my ($width) = $self->_rearrange([qw(WIDTH)], @args); | |
96 $width && $self->width($width); | |
97 unless ( defined $self->sequence_factory ) { | |
98 $self->sequence_factory(Bio::Seq::SeqFastaSpeedFactory->new()); | |
99 } | |
100 } | |
101 | |
102 =head2 next_seq | |
103 | |
104 Title : next_seq | |
105 Usage : $seq = $stream->next_seq() | |
106 Function: returns the next sequence in the stream | |
107 Returns : Bio::Seq object | |
108 Args : NONE | |
109 | |
110 =cut | |
111 | |
112 sub next_seq { | |
113 my( $self ) = @_; | |
114 my $seq; | |
115 my $alphabet; | |
116 local $/ = "\n>"; | |
117 return unless my $entry = $self->_readline; | |
118 | |
119 chomp($entry); | |
120 if ($entry =~ m/\A\s*\Z/s) { # very first one | |
121 return unless $entry = $self->_readline; | |
122 chomp($entry); | |
123 } | |
124 $entry =~ s/^>//; | |
125 | |
126 my ($top,$sequence) = split(/\n/,$entry,2); | |
127 defined $sequence && $sequence =~ s/>//g; | |
128 # my ($top,$sequence) = $entry =~ /^>?(.+?)\n+([^>]*)/s | |
129 # or $self->throw("Can't parse fasta entry"); | |
130 | |
131 my ($id,$fulldesc); | |
132 if( $top =~ /^\s*(\S+)\s*(.*)/ ) { | |
133 ($id,$fulldesc) = ($1,$2); | |
134 } | |
135 | |
136 if (defined $id && $id eq '') {$id=$fulldesc;} # FIX incase no space | |
137 # between > and name \AE | |
138 defined $sequence && $sequence =~ s/\s//g; # Remove whitespace | |
139 | |
140 # for empty sequences we need to know the mol.type | |
141 $alphabet = $self->alphabet(); | |
142 if(defined $sequence && length($sequence) == 0) { | |
143 if(! defined($alphabet)) { | |
144 # let's default to dna | |
145 $alphabet = "dna"; | |
146 } | |
147 } else { | |
148 # we don't need it really, so disable | |
149 $alphabet = undef; | |
150 } | |
151 | |
152 $seq = $self->sequence_factory->create( | |
153 -seq => $sequence, | |
154 -id => $id, | |
155 # Ewan's note - I don't think this healthy | |
156 # but obviously to taste. | |
157 #-primary_id => $id, | |
158 -desc => $fulldesc, | |
159 -alphabet => $alphabet, | |
160 -direct => 1, | |
161 ); | |
162 | |
163 | |
164 | |
165 | |
166 # if there wasn't one before, set the guessed type | |
167 unless ( defined $alphabet ) { | |
168 $self->alphabet($seq->alphabet()); | |
169 } | |
170 return $seq; | |
171 | |
172 } | |
173 | |
174 =head2 write_seq | |
175 | |
176 Title : write_seq | |
177 Usage : $stream->write_seq(@seq) | |
178 Function: writes the $seq object into the stream | |
179 Returns : 1 for success and 0 for error | |
180 Args : array of 1 to n Bio::PrimarySeqI objects | |
181 | |
182 | |
183 =cut | |
184 | |
185 sub write_seq { | |
186 my ($self,@seq) = @_; | |
187 my $width = $self->width; | |
188 foreach my $seq (@seq) { | |
189 $self->throw("Did not provide a valid Bio::PrimarySeqI object") | |
190 unless defined $seq && ref($seq) && $seq->isa('Bio::PrimarySeqI'); | |
191 | |
192 my $str = $seq->seq; | |
193 my $top; | |
194 | |
195 # Allow for different ids | |
196 my $id_type = $self->preferred_id_type; | |
197 if( $id_type =~ /^acc/i ) { | |
198 $top = $seq->accession_number(); | |
199 if( $id_type =~ /vers/i ) { | |
200 $top .= "." . $seq->version(); | |
201 } | |
202 } elsif($id_type =~ /^displ/i ) { | |
203 $top = $seq->display_id(); | |
204 } elsif($id_type =~ /^pri/i ) { | |
205 $top = $seq->primary_id(); | |
206 } | |
207 | |
208 if ($seq->can('desc') and my $desc = $seq->desc()) { | |
209 $desc =~ s/\n//g; | |
210 $top .= " $desc"; | |
211 } | |
212 if(length($str) > 0) { | |
213 $str =~ s/(.{1,$width})/$1\n/g; | |
214 } else { | |
215 $str = "\n"; | |
216 } | |
217 $self->_print (">",$top,"\n",$str) or return; | |
218 } | |
219 | |
220 $self->flush if $self->_flush_on_write && defined $self->_fh; | |
221 return 1; | |
222 } | |
223 | |
224 =head2 width | |
225 | |
226 Title : width | |
227 Usage : $obj->width($newval) | |
228 Function: Get/Set the line width for FASTA output | |
229 Returns : value of width | |
230 Args : newvalue (optional) | |
231 | |
232 | |
233 =cut | |
234 | |
235 sub width{ | |
236 my ($self,$value) = @_; | |
237 if( defined $value) { | |
238 $self->{'width'} = $value; | |
239 } | |
240 return $self->{'width'} || $WIDTH; | |
241 } | |
242 | |
243 =head2 preferred_id_type | |
244 | |
245 Title : preferred_id_type | |
246 Usage : $obj->preferred_id_type('accession') | |
247 Function: Get/Set the preferred type of identifier to use in the ">ID" position | |
248 for FASTA output. | |
249 Returns : string, one of values defined in @Bio::SeqIO::fasta::SEQ_ID_TYPES. | |
250 Default = $Bio::SeqIO::fasta::DEFAULT_SEQ_ID_TYPE ('display'). | |
251 Args : string when setting. This must be one of values defined in | |
252 @Bio::SeqIO::fasta::SEQ_ID_TYPES. Allowable values: | |
253 accession, accession.version, display, primary | |
254 Throws : fatal exception if the supplied id type is not in @SEQ_ID_TYPES. | |
255 | |
256 =cut | |
257 | |
258 sub preferred_id_type { | |
259 my ($self,$type) = @_; | |
260 if( defined $type ) { | |
261 if( ! grep lc($type) eq $_, @SEQ_ID_TYPES) { | |
262 $self->throw(-class=>'Bio::Root::BadParameter', | |
263 -text=>"Invalid ID type \"$type\". Must be one of: @SEQ_ID_TYPES"); | |
264 } | |
265 $self->{'_seq_id_type'} = lc($type); | |
266 # print STDERR "Setting preferred_id_type=$type\n"; | |
267 } | |
268 $self->{'_seq_id_type'} || $DEFAULT_SEQ_ID_TYPE; | |
269 } | |
270 | |
271 1; |