comparison variant_effect_predictor/Bio/Tools/ESTScan.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 # $Id: ESTScan.pm,v 1.10 2002/10/22 07:38:45 lapp Exp $
2 #
3 # BioPerl module for Bio::Tools::ESTScan
4 #
5 # Cared for by Hilmar Lapp <hlapp@gmx.net>
6 #
7 # Copyright Hilmar Lapp
8 #
9 # You may distribute this module under the same terms as perl itself
10
11 # POD documentation - main docs before the code
12
13 =head1 NAME
14
15 Bio::Tools::ESTScan - Results of one ESTScan run
16
17 =head1 SYNOPSIS
18
19 $estscan = Bio::Tools::ESTScan->new(-file => 'result.estscan');
20 # filehandle:
21 $estscan = Bio::Tools::ESTScan->new( -fh => \*INPUT );
22
23 # parse the results
24 # note: this class is-a Bio::Tools::AnalysisResult which implements
25 # Bio::SeqAnalysisParserI, i.e., $genscan->next_feature() is the same
26 while($gene = $estscan->next_prediction()) {
27 # $gene is an instance of Bio::Tools::Prediction::Gene
28 foreach my $orf ($gene->exons()) {
29 # $orf is an instance of Bio::Tools::Prediction::Exon
30 $cds_str = $orf->predicted_cds();
31 }
32 }
33
34 # essential if you gave a filename at initialization (otherwise the file
35 # will stay open)
36 $estscan->close();
37
38 =head1 DESCRIPTION
39
40 The ESTScan module provides a parser for ESTScan coding region prediction
41 output.
42
43 This module inherits off L<Bio::Tools::AnalysisResult> and therefore
44 implements the L<Bio::SeqAnalysisParserI> interface.
45 See L<Bio::SeqAnalysisParserI>.
46
47 =head1 FEEDBACK
48
49 =head2 Mailing Lists
50
51 User feedback is an integral part of the evolution of this and other
52 Bioperl modules. Send your comments and suggestions preferably to one
53 of the Bioperl mailing lists. Your participation is much appreciated.
54
55 bioperl-l@bioperl.org - General discussion
56 http://bio.perl.org/MailList.html - About the mailing lists
57
58 =head2 Reporting Bugs
59
60 Report bugs to the Bioperl bug tracking system to help us keep track
61 the bugs and their resolution. Bug reports can be submitted via email
62 or the web:
63
64 bioperl-bugs@bio.perl.org
65 http://bugzilla.bioperl.org/
66
67 =head1 AUTHOR - Hilmar Lapp
68
69 Email hlapp@gmx.net (or hilmar.lapp@pharma.novartis.com)
70
71 Describe contact details here
72
73 =head1 APPENDIX
74
75 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
76
77 =cut
78
79 # Let the code begin...
80
81 package Bio::Tools::ESTScan;
82 use vars qw(@ISA);
83 use strict;
84 use Symbol;
85
86 use Bio::Root::Root;
87 use Bio::Tools::AnalysisResult;
88 use Bio::Tools::Prediction::Exon;
89
90 @ISA = qw(Bio::Tools::AnalysisResult);
91
92 sub _initialize_state {
93 my ($self,@args) = @_;
94
95 # first call the inherited method!
96 my $make = $self->SUPER::_initialize_state(@args);
97
98 if(! $self->analysis_method()) {
99 $self->analysis_method('ESTScan');
100 }
101 }
102
103 =head2 analysis_method
104
105 Usage : $estscan->analysis_method();
106 Purpose : Inherited method. Overridden to ensure that the name matches
107 /estscan/i.
108 Returns : String
109 Argument : n/a
110
111 =cut
112
113 #-------------
114 sub analysis_method {
115 #-------------
116 my ($self, $method) = @_;
117 if($method && ($method !~ /estscan/i)) {
118 $self->throw("method $method not supported in " . ref($self));
119 }
120 return $self->SUPER::analysis_method($method);
121 }
122
123 =head2 next_feature
124
125 Title : next_feature
126 Usage : while($orf = $estscan->next_feature()) {
127 # do something
128 }
129 Function: Returns the next gene structure prediction of the ESTScan result
130 file. Call this method repeatedly until FALSE is returned.
131
132 The returned object is actually a SeqFeatureI implementing object.
133 This method is required for classes implementing the
134 SeqAnalysisParserI interface, and is merely an alias for
135 next_prediction() at present.
136
137 Example :
138 Returns : A Bio::Tools::Prediction::Gene object.
139 Args :
140
141 =cut
142
143 sub next_feature {
144 my ($self,@args) = @_;
145 # even though next_prediction doesn't expect any args (and this method
146 # does neither), we pass on args in order to be prepared if this changes
147 # ever
148 return $self->next_prediction(@args);
149 }
150
151 =head2 next_prediction
152
153 Title : next_prediction
154 Usage : while($gene = $estscan->next_prediction()) {
155 # do something
156 }
157 Function: Returns the next gene structure prediction of the ESTScan result
158 file. Call this method repeatedly until FALSE is returned.
159
160 So far, this method DOES NOT work for reverse strand predictions,
161 even though the code looks like.
162 Example :
163 Returns : A Bio::Tools::Prediction::Gene object.
164 Args :
165
166 =cut
167
168 sub next_prediction {
169 my ($self) = @_;
170 my ($gene, $seq, $cds, $predobj);
171 my $numins = 0;
172
173 # predictions are in the format of FASTA sequences and can be parsed one
174 # at a time
175 $seq = $self->_fasta_stream()->next_seq();
176 return unless $seq;
177 # there is a new prediction
178 $gene = Bio::Tools::Prediction::Gene->new('-primary' => "ORFprediction",
179 '-source' => "ESTScan");
180 # score starts the description
181 $seq->desc() =~ /^([\d.]+)\s*(.*)/ or
182 $self->throw("unexpected format of description: no score in " .
183 $seq->desc());
184 $gene->score($1);
185 $seq->desc($2);
186 # strand may end the description
187 if($seq->desc() =~ /(.*)minus strand$/) {
188 my $desc = $1;
189 $desc =~ s/;\s+$//;
190 $seq->desc($desc);
191 $gene->strand(-1);
192 } else {
193 $gene->strand(1);
194 }
195 # check for the format: default or 'all-in-one' (option -a)
196 if($seq->desc() =~ /^(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s*(.*)/) {
197 # default format
198 $seq->desc($5);
199 $predobj = Bio::Tools::Prediction::Exon->new('-source' => "ESTScan",
200 '-start' => $3,
201 '-end' => $4);
202 $predobj->strand($gene->strand());
203 $predobj->score($gene->score()); # FIXME or $1, or $2 ?
204 $predobj->primary_tag("InternalExon");
205 $predobj->seq_id($seq->display_id());
206 # add to gene structure object
207 $gene->add_exon($predobj);
208 # add predicted CDS
209 $cds = $seq->seq();
210 $cds =~ s/[a-z]//g; # remove the deletions, but keep the insertions
211 $cds = Bio::PrimarySeq->new('-seq' => $cds,
212 '-display_id' => $seq->display_id(),
213 '-desc' => $seq->desc(),
214 '-alphabet' => "dna");
215 $gene->predicted_cds($cds);
216 $predobj->predicted_cds($cds);
217 if($gene->strand() == -1) {
218 $self->warn("reverse strand ORF, but unable to reverse coordinates!");
219 }
220 } else {
221 #
222 # All-in-one format (hopefully). This encodes the following information
223 # into the sequence:
224 # 1) untranslated regions: stretches of lower-case letters
225 # 2) translated regions: stretches of upper-case letters
226 # 3) insertions in the translated regions: capital X
227 # 4) deletions in the translated regions: a single lower-case letter
228 #
229 # if reverse strand ORF, save a lot of hassle by reversing the sequence
230 if($gene->strand() == -1) {
231 $seq = $seq->revcom();
232 }
233 my $seqstr = $seq->seq();
234 while($seqstr =~ /^([a-z]*)([A-Z].*)$/) {
235 # leading 5'UTR
236 my $utr5 = $1;
237 # exon + 3'UTR
238 my $exonseq = $2;
239 # strip 3'UTR and following exons
240 if($exonseq =~ s/([a-z]{2,}.*)$//) {
241 $seqstr = $1;
242 } else {
243 $seqstr = "";
244 }
245 # start: take care of yielding the absolute coordinate
246 my $start = CORE::length($utr5) + 1;
247 if($predobj) {
248 $start += $predobj->end() + $numins;
249 }
250 # for the end coordinate, we need to subtract the insertions
251 $cds = $exonseq;
252 $cds =~ s/[X]//g;
253 my $end = $start + CORE::length($cds) - 1;
254 # construct next exon object
255 $predobj = Bio::Tools::Prediction::Exon->new('-start' => $start,
256 '-end' => $end);
257 $predobj->source_tag("ESTScan");
258 $predobj->primary_tag("InternalExon");
259 $predobj->seq_id($seq->display_id());
260 $predobj->strand($gene->strand());
261 $predobj->score($gene->score());
262 # add the exon to the gene structure object
263 $gene->add_exon($predobj);
264 # add the predicted CDS
265 $cds = $exonseq;
266 $cds =~ s/[a-z]//g; # remove the deletions, but keep the insertions
267 $cds = Bio::PrimarySeq->new('-seq' => $cds,
268 '-display_id' => $seq->display_id(),
269 '-desc' => $seq->desc(),
270 '-alphabet' => "dna");
271 # only store the first one in the overall prediction
272 $gene->predicted_cds($cds) unless $gene->predicted_cds();
273 $predobj->predicted_cds($cds);
274 # add the predicted insertions and deletions as subfeatures
275 # of the exon
276 my $fea = undef;
277 while($exonseq =~ /([a-zX])/g) {
278 my $indel = $1;
279 # start and end: start looking at the position after the
280 # previous feature
281 if($fea) {
282 $start = $fea->start()+$numins;
283 $start -= 1 if($fea->primary_tag() eq 'insertion');
284 } else {
285 $start = $predobj->start()+$numins-1;
286 }
287 #print "# numins = $numins, indel = $indel, start = $start\n";
288 $start = index($seq->seq(), $indel, $start) + 1 - $numins;
289 $fea = Bio::SeqFeature::Generic->new('-start' => $start,
290 '-end' => $start);
291 $fea->source_tag("ESTScan");
292 $fea->seq_id($seq->display_id());
293 $fea->strand($predobj->strand());
294 if($indel eq 'X') {
295 # an insertion (depends on viewpoint: to get the 'real'
296 # CDS, a base has to be inserted, i.e., the HMMER model
297 # inserted a base; however, the sequencing process deleted
298 # a base that was there).
299 $fea->primary_tag("insertion");
300 # we need to count insertions because these are left out
301 # of any coordinates saved in the objects (which is correct
302 # because insertions change the original sequence, so
303 # coordinates wouldn't match)
304 $numins++;
305 } else {
306 # a deletion (depends on viewpoint: to get the 'real'
307 # CDS, a base has to be deleted, i.e., the HMMER model
308 # deleted a base; however, the sequencing process inserted
309 # a base that wasn't there).
310 $fea->primary_tag("deletion");
311 $fea->add_tag_value('base', $indel);
312 }
313 $predobj->add_sub_SeqFeature($fea);
314 }
315 }
316 }
317
318 return $gene;
319 }
320
321 =head2 close
322
323 Title : close
324 Usage : $result->close()
325 Function: Closes the file handle associated with this result file.
326 Inherited method, overridden.
327 Example :
328 Returns :
329 Args :
330
331 =cut
332
333 sub close {
334 my ($self, @args) = @_;
335
336 delete($self->{'_fastastream'});
337 $self->SUPER::close(@args);
338 }
339
340 =head2 _fasta_stream
341
342 Title : _fasta_stream
343 Usage : $result->_fasta_stream()
344 Function: Gets/Sets the FASTA sequence IO stream for reading the contents of
345 the file associated with this MZEF result object.
346
347 If called for the first time, creates the stream from the filehandle
348 if necessary.
349 Example :
350 Returns :
351 Args :
352
353 =cut
354
355 sub _fasta_stream {
356 my ($self, $stream) = @_;
357
358 if($stream || (! exists($self->{'_fastastream'}))) {
359 if(! $stream) {
360 $stream = Bio::SeqIO->new('-fh' => $self->_fh(),
361 '-format' => "fasta");
362 }
363 $self->{'_fastastream'} = $stream;
364 }
365 return $self->{'_fastastream'};
366 }
367
368 1;
369