0
|
1 # $Id: ESTScan.pm,v 1.10 2002/10/22 07:38:45 lapp Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::Tools::ESTScan
|
|
4 #
|
|
5 # Cared for by Hilmar Lapp <hlapp@gmx.net>
|
|
6 #
|
|
7 # Copyright Hilmar Lapp
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10
|
|
11 # POD documentation - main docs before the code
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 Bio::Tools::ESTScan - Results of one ESTScan run
|
|
16
|
|
17 =head1 SYNOPSIS
|
|
18
|
|
19 $estscan = Bio::Tools::ESTScan->new(-file => 'result.estscan');
|
|
20 # filehandle:
|
|
21 $estscan = Bio::Tools::ESTScan->new( -fh => \*INPUT );
|
|
22
|
|
23 # parse the results
|
|
24 # note: this class is-a Bio::Tools::AnalysisResult which implements
|
|
25 # Bio::SeqAnalysisParserI, i.e., $genscan->next_feature() is the same
|
|
26 while($gene = $estscan->next_prediction()) {
|
|
27 # $gene is an instance of Bio::Tools::Prediction::Gene
|
|
28 foreach my $orf ($gene->exons()) {
|
|
29 # $orf is an instance of Bio::Tools::Prediction::Exon
|
|
30 $cds_str = $orf->predicted_cds();
|
|
31 }
|
|
32 }
|
|
33
|
|
34 # essential if you gave a filename at initialization (otherwise the file
|
|
35 # will stay open)
|
|
36 $estscan->close();
|
|
37
|
|
38 =head1 DESCRIPTION
|
|
39
|
|
40 The ESTScan module provides a parser for ESTScan coding region prediction
|
|
41 output.
|
|
42
|
|
43 This module inherits off L<Bio::Tools::AnalysisResult> and therefore
|
|
44 implements the L<Bio::SeqAnalysisParserI> interface.
|
|
45 See L<Bio::SeqAnalysisParserI>.
|
|
46
|
|
47 =head1 FEEDBACK
|
|
48
|
|
49 =head2 Mailing Lists
|
|
50
|
|
51 User feedback is an integral part of the evolution of this and other
|
|
52 Bioperl modules. Send your comments and suggestions preferably to one
|
|
53 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
54
|
|
55 bioperl-l@bioperl.org - General discussion
|
|
56 http://bio.perl.org/MailList.html - About the mailing lists
|
|
57
|
|
58 =head2 Reporting Bugs
|
|
59
|
|
60 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
61 the bugs and their resolution. Bug reports can be submitted via email
|
|
62 or the web:
|
|
63
|
|
64 bioperl-bugs@bio.perl.org
|
|
65 http://bugzilla.bioperl.org/
|
|
66
|
|
67 =head1 AUTHOR - Hilmar Lapp
|
|
68
|
|
69 Email hlapp@gmx.net (or hilmar.lapp@pharma.novartis.com)
|
|
70
|
|
71 Describe contact details here
|
|
72
|
|
73 =head1 APPENDIX
|
|
74
|
|
75 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
|
|
76
|
|
77 =cut
|
|
78
|
|
79 # Let the code begin...
|
|
80
|
|
81 package Bio::Tools::ESTScan;
|
|
82 use vars qw(@ISA);
|
|
83 use strict;
|
|
84 use Symbol;
|
|
85
|
|
86 use Bio::Root::Root;
|
|
87 use Bio::Tools::AnalysisResult;
|
|
88 use Bio::Tools::Prediction::Exon;
|
|
89
|
|
90 @ISA = qw(Bio::Tools::AnalysisResult);
|
|
91
|
|
92 sub _initialize_state {
|
|
93 my ($self,@args) = @_;
|
|
94
|
|
95 # first call the inherited method!
|
|
96 my $make = $self->SUPER::_initialize_state(@args);
|
|
97
|
|
98 if(! $self->analysis_method()) {
|
|
99 $self->analysis_method('ESTScan');
|
|
100 }
|
|
101 }
|
|
102
|
|
103 =head2 analysis_method
|
|
104
|
|
105 Usage : $estscan->analysis_method();
|
|
106 Purpose : Inherited method. Overridden to ensure that the name matches
|
|
107 /estscan/i.
|
|
108 Returns : String
|
|
109 Argument : n/a
|
|
110
|
|
111 =cut
|
|
112
|
|
113 #-------------
|
|
114 sub analysis_method {
|
|
115 #-------------
|
|
116 my ($self, $method) = @_;
|
|
117 if($method && ($method !~ /estscan/i)) {
|
|
118 $self->throw("method $method not supported in " . ref($self));
|
|
119 }
|
|
120 return $self->SUPER::analysis_method($method);
|
|
121 }
|
|
122
|
|
123 =head2 next_feature
|
|
124
|
|
125 Title : next_feature
|
|
126 Usage : while($orf = $estscan->next_feature()) {
|
|
127 # do something
|
|
128 }
|
|
129 Function: Returns the next gene structure prediction of the ESTScan result
|
|
130 file. Call this method repeatedly until FALSE is returned.
|
|
131
|
|
132 The returned object is actually a SeqFeatureI implementing object.
|
|
133 This method is required for classes implementing the
|
|
134 SeqAnalysisParserI interface, and is merely an alias for
|
|
135 next_prediction() at present.
|
|
136
|
|
137 Example :
|
|
138 Returns : A Bio::Tools::Prediction::Gene object.
|
|
139 Args :
|
|
140
|
|
141 =cut
|
|
142
|
|
143 sub next_feature {
|
|
144 my ($self,@args) = @_;
|
|
145 # even though next_prediction doesn't expect any args (and this method
|
|
146 # does neither), we pass on args in order to be prepared if this changes
|
|
147 # ever
|
|
148 return $self->next_prediction(@args);
|
|
149 }
|
|
150
|
|
151 =head2 next_prediction
|
|
152
|
|
153 Title : next_prediction
|
|
154 Usage : while($gene = $estscan->next_prediction()) {
|
|
155 # do something
|
|
156 }
|
|
157 Function: Returns the next gene structure prediction of the ESTScan result
|
|
158 file. Call this method repeatedly until FALSE is returned.
|
|
159
|
|
160 So far, this method DOES NOT work for reverse strand predictions,
|
|
161 even though the code looks like.
|
|
162 Example :
|
|
163 Returns : A Bio::Tools::Prediction::Gene object.
|
|
164 Args :
|
|
165
|
|
166 =cut
|
|
167
|
|
168 sub next_prediction {
|
|
169 my ($self) = @_;
|
|
170 my ($gene, $seq, $cds, $predobj);
|
|
171 my $numins = 0;
|
|
172
|
|
173 # predictions are in the format of FASTA sequences and can be parsed one
|
|
174 # at a time
|
|
175 $seq = $self->_fasta_stream()->next_seq();
|
|
176 return unless $seq;
|
|
177 # there is a new prediction
|
|
178 $gene = Bio::Tools::Prediction::Gene->new('-primary' => "ORFprediction",
|
|
179 '-source' => "ESTScan");
|
|
180 # score starts the description
|
|
181 $seq->desc() =~ /^([\d.]+)\s*(.*)/ or
|
|
182 $self->throw("unexpected format of description: no score in " .
|
|
183 $seq->desc());
|
|
184 $gene->score($1);
|
|
185 $seq->desc($2);
|
|
186 # strand may end the description
|
|
187 if($seq->desc() =~ /(.*)minus strand$/) {
|
|
188 my $desc = $1;
|
|
189 $desc =~ s/;\s+$//;
|
|
190 $seq->desc($desc);
|
|
191 $gene->strand(-1);
|
|
192 } else {
|
|
193 $gene->strand(1);
|
|
194 }
|
|
195 # check for the format: default or 'all-in-one' (option -a)
|
|
196 if($seq->desc() =~ /^(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s*(.*)/) {
|
|
197 # default format
|
|
198 $seq->desc($5);
|
|
199 $predobj = Bio::Tools::Prediction::Exon->new('-source' => "ESTScan",
|
|
200 '-start' => $3,
|
|
201 '-end' => $4);
|
|
202 $predobj->strand($gene->strand());
|
|
203 $predobj->score($gene->score()); # FIXME or $1, or $2 ?
|
|
204 $predobj->primary_tag("InternalExon");
|
|
205 $predobj->seq_id($seq->display_id());
|
|
206 # add to gene structure object
|
|
207 $gene->add_exon($predobj);
|
|
208 # add predicted CDS
|
|
209 $cds = $seq->seq();
|
|
210 $cds =~ s/[a-z]//g; # remove the deletions, but keep the insertions
|
|
211 $cds = Bio::PrimarySeq->new('-seq' => $cds,
|
|
212 '-display_id' => $seq->display_id(),
|
|
213 '-desc' => $seq->desc(),
|
|
214 '-alphabet' => "dna");
|
|
215 $gene->predicted_cds($cds);
|
|
216 $predobj->predicted_cds($cds);
|
|
217 if($gene->strand() == -1) {
|
|
218 $self->warn("reverse strand ORF, but unable to reverse coordinates!");
|
|
219 }
|
|
220 } else {
|
|
221 #
|
|
222 # All-in-one format (hopefully). This encodes the following information
|
|
223 # into the sequence:
|
|
224 # 1) untranslated regions: stretches of lower-case letters
|
|
225 # 2) translated regions: stretches of upper-case letters
|
|
226 # 3) insertions in the translated regions: capital X
|
|
227 # 4) deletions in the translated regions: a single lower-case letter
|
|
228 #
|
|
229 # if reverse strand ORF, save a lot of hassle by reversing the sequence
|
|
230 if($gene->strand() == -1) {
|
|
231 $seq = $seq->revcom();
|
|
232 }
|
|
233 my $seqstr = $seq->seq();
|
|
234 while($seqstr =~ /^([a-z]*)([A-Z].*)$/) {
|
|
235 # leading 5'UTR
|
|
236 my $utr5 = $1;
|
|
237 # exon + 3'UTR
|
|
238 my $exonseq = $2;
|
|
239 # strip 3'UTR and following exons
|
|
240 if($exonseq =~ s/([a-z]{2,}.*)$//) {
|
|
241 $seqstr = $1;
|
|
242 } else {
|
|
243 $seqstr = "";
|
|
244 }
|
|
245 # start: take care of yielding the absolute coordinate
|
|
246 my $start = CORE::length($utr5) + 1;
|
|
247 if($predobj) {
|
|
248 $start += $predobj->end() + $numins;
|
|
249 }
|
|
250 # for the end coordinate, we need to subtract the insertions
|
|
251 $cds = $exonseq;
|
|
252 $cds =~ s/[X]//g;
|
|
253 my $end = $start + CORE::length($cds) - 1;
|
|
254 # construct next exon object
|
|
255 $predobj = Bio::Tools::Prediction::Exon->new('-start' => $start,
|
|
256 '-end' => $end);
|
|
257 $predobj->source_tag("ESTScan");
|
|
258 $predobj->primary_tag("InternalExon");
|
|
259 $predobj->seq_id($seq->display_id());
|
|
260 $predobj->strand($gene->strand());
|
|
261 $predobj->score($gene->score());
|
|
262 # add the exon to the gene structure object
|
|
263 $gene->add_exon($predobj);
|
|
264 # add the predicted CDS
|
|
265 $cds = $exonseq;
|
|
266 $cds =~ s/[a-z]//g; # remove the deletions, but keep the insertions
|
|
267 $cds = Bio::PrimarySeq->new('-seq' => $cds,
|
|
268 '-display_id' => $seq->display_id(),
|
|
269 '-desc' => $seq->desc(),
|
|
270 '-alphabet' => "dna");
|
|
271 # only store the first one in the overall prediction
|
|
272 $gene->predicted_cds($cds) unless $gene->predicted_cds();
|
|
273 $predobj->predicted_cds($cds);
|
|
274 # add the predicted insertions and deletions as subfeatures
|
|
275 # of the exon
|
|
276 my $fea = undef;
|
|
277 while($exonseq =~ /([a-zX])/g) {
|
|
278 my $indel = $1;
|
|
279 # start and end: start looking at the position after the
|
|
280 # previous feature
|
|
281 if($fea) {
|
|
282 $start = $fea->start()+$numins;
|
|
283 $start -= 1 if($fea->primary_tag() eq 'insertion');
|
|
284 } else {
|
|
285 $start = $predobj->start()+$numins-1;
|
|
286 }
|
|
287 #print "# numins = $numins, indel = $indel, start = $start\n";
|
|
288 $start = index($seq->seq(), $indel, $start) + 1 - $numins;
|
|
289 $fea = Bio::SeqFeature::Generic->new('-start' => $start,
|
|
290 '-end' => $start);
|
|
291 $fea->source_tag("ESTScan");
|
|
292 $fea->seq_id($seq->display_id());
|
|
293 $fea->strand($predobj->strand());
|
|
294 if($indel eq 'X') {
|
|
295 # an insertion (depends on viewpoint: to get the 'real'
|
|
296 # CDS, a base has to be inserted, i.e., the HMMER model
|
|
297 # inserted a base; however, the sequencing process deleted
|
|
298 # a base that was there).
|
|
299 $fea->primary_tag("insertion");
|
|
300 # we need to count insertions because these are left out
|
|
301 # of any coordinates saved in the objects (which is correct
|
|
302 # because insertions change the original sequence, so
|
|
303 # coordinates wouldn't match)
|
|
304 $numins++;
|
|
305 } else {
|
|
306 # a deletion (depends on viewpoint: to get the 'real'
|
|
307 # CDS, a base has to be deleted, i.e., the HMMER model
|
|
308 # deleted a base; however, the sequencing process inserted
|
|
309 # a base that wasn't there).
|
|
310 $fea->primary_tag("deletion");
|
|
311 $fea->add_tag_value('base', $indel);
|
|
312 }
|
|
313 $predobj->add_sub_SeqFeature($fea);
|
|
314 }
|
|
315 }
|
|
316 }
|
|
317
|
|
318 return $gene;
|
|
319 }
|
|
320
|
|
321 =head2 close
|
|
322
|
|
323 Title : close
|
|
324 Usage : $result->close()
|
|
325 Function: Closes the file handle associated with this result file.
|
|
326 Inherited method, overridden.
|
|
327 Example :
|
|
328 Returns :
|
|
329 Args :
|
|
330
|
|
331 =cut
|
|
332
|
|
333 sub close {
|
|
334 my ($self, @args) = @_;
|
|
335
|
|
336 delete($self->{'_fastastream'});
|
|
337 $self->SUPER::close(@args);
|
|
338 }
|
|
339
|
|
340 =head2 _fasta_stream
|
|
341
|
|
342 Title : _fasta_stream
|
|
343 Usage : $result->_fasta_stream()
|
|
344 Function: Gets/Sets the FASTA sequence IO stream for reading the contents of
|
|
345 the file associated with this MZEF result object.
|
|
346
|
|
347 If called for the first time, creates the stream from the filehandle
|
|
348 if necessary.
|
|
349 Example :
|
|
350 Returns :
|
|
351 Args :
|
|
352
|
|
353 =cut
|
|
354
|
|
355 sub _fasta_stream {
|
|
356 my ($self, $stream) = @_;
|
|
357
|
|
358 if($stream || (! exists($self->{'_fastastream'}))) {
|
|
359 if(! $stream) {
|
|
360 $stream = Bio::SeqIO->new('-fh' => $self->_fh(),
|
|
361 '-format' => "fasta");
|
|
362 }
|
|
363 $self->{'_fastastream'} = $stream;
|
|
364 }
|
|
365 return $self->{'_fastastream'};
|
|
366 }
|
|
367
|
|
368 1;
|
|
369
|