Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Tools/ESTScan.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: ESTScan.pm,v 1.10 2002/10/22 07:38:45 lapp Exp $ | |
2 # | |
3 # BioPerl module for Bio::Tools::ESTScan | |
4 # | |
5 # Cared for by Hilmar Lapp <hlapp@gmx.net> | |
6 # | |
7 # Copyright Hilmar Lapp | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 | |
11 # POD documentation - main docs before the code | |
12 | |
13 =head1 NAME | |
14 | |
15 Bio::Tools::ESTScan - Results of one ESTScan run | |
16 | |
17 =head1 SYNOPSIS | |
18 | |
19 $estscan = Bio::Tools::ESTScan->new(-file => 'result.estscan'); | |
20 # filehandle: | |
21 $estscan = Bio::Tools::ESTScan->new( -fh => \*INPUT ); | |
22 | |
23 # parse the results | |
24 # note: this class is-a Bio::Tools::AnalysisResult which implements | |
25 # Bio::SeqAnalysisParserI, i.e., $genscan->next_feature() is the same | |
26 while($gene = $estscan->next_prediction()) { | |
27 # $gene is an instance of Bio::Tools::Prediction::Gene | |
28 foreach my $orf ($gene->exons()) { | |
29 # $orf is an instance of Bio::Tools::Prediction::Exon | |
30 $cds_str = $orf->predicted_cds(); | |
31 } | |
32 } | |
33 | |
34 # essential if you gave a filename at initialization (otherwise the file | |
35 # will stay open) | |
36 $estscan->close(); | |
37 | |
38 =head1 DESCRIPTION | |
39 | |
40 The ESTScan module provides a parser for ESTScan coding region prediction | |
41 output. | |
42 | |
43 This module inherits off L<Bio::Tools::AnalysisResult> and therefore | |
44 implements the L<Bio::SeqAnalysisParserI> interface. | |
45 See L<Bio::SeqAnalysisParserI>. | |
46 | |
47 =head1 FEEDBACK | |
48 | |
49 =head2 Mailing Lists | |
50 | |
51 User feedback is an integral part of the evolution of this and other | |
52 Bioperl modules. Send your comments and suggestions preferably to one | |
53 of the Bioperl mailing lists. Your participation is much appreciated. | |
54 | |
55 bioperl-l@bioperl.org - General discussion | |
56 http://bio.perl.org/MailList.html - About the mailing lists | |
57 | |
58 =head2 Reporting Bugs | |
59 | |
60 Report bugs to the Bioperl bug tracking system to help us keep track | |
61 the bugs and their resolution. Bug reports can be submitted via email | |
62 or the web: | |
63 | |
64 bioperl-bugs@bio.perl.org | |
65 http://bugzilla.bioperl.org/ | |
66 | |
67 =head1 AUTHOR - Hilmar Lapp | |
68 | |
69 Email hlapp@gmx.net (or hilmar.lapp@pharma.novartis.com) | |
70 | |
71 Describe contact details here | |
72 | |
73 =head1 APPENDIX | |
74 | |
75 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ | |
76 | |
77 =cut | |
78 | |
79 # Let the code begin... | |
80 | |
81 package Bio::Tools::ESTScan; | |
82 use vars qw(@ISA); | |
83 use strict; | |
84 use Symbol; | |
85 | |
86 use Bio::Root::Root; | |
87 use Bio::Tools::AnalysisResult; | |
88 use Bio::Tools::Prediction::Exon; | |
89 | |
90 @ISA = qw(Bio::Tools::AnalysisResult); | |
91 | |
92 sub _initialize_state { | |
93 my ($self,@args) = @_; | |
94 | |
95 # first call the inherited method! | |
96 my $make = $self->SUPER::_initialize_state(@args); | |
97 | |
98 if(! $self->analysis_method()) { | |
99 $self->analysis_method('ESTScan'); | |
100 } | |
101 } | |
102 | |
103 =head2 analysis_method | |
104 | |
105 Usage : $estscan->analysis_method(); | |
106 Purpose : Inherited method. Overridden to ensure that the name matches | |
107 /estscan/i. | |
108 Returns : String | |
109 Argument : n/a | |
110 | |
111 =cut | |
112 | |
113 #------------- | |
114 sub analysis_method { | |
115 #------------- | |
116 my ($self, $method) = @_; | |
117 if($method && ($method !~ /estscan/i)) { | |
118 $self->throw("method $method not supported in " . ref($self)); | |
119 } | |
120 return $self->SUPER::analysis_method($method); | |
121 } | |
122 | |
123 =head2 next_feature | |
124 | |
125 Title : next_feature | |
126 Usage : while($orf = $estscan->next_feature()) { | |
127 # do something | |
128 } | |
129 Function: Returns the next gene structure prediction of the ESTScan result | |
130 file. Call this method repeatedly until FALSE is returned. | |
131 | |
132 The returned object is actually a SeqFeatureI implementing object. | |
133 This method is required for classes implementing the | |
134 SeqAnalysisParserI interface, and is merely an alias for | |
135 next_prediction() at present. | |
136 | |
137 Example : | |
138 Returns : A Bio::Tools::Prediction::Gene object. | |
139 Args : | |
140 | |
141 =cut | |
142 | |
143 sub next_feature { | |
144 my ($self,@args) = @_; | |
145 # even though next_prediction doesn't expect any args (and this method | |
146 # does neither), we pass on args in order to be prepared if this changes | |
147 # ever | |
148 return $self->next_prediction(@args); | |
149 } | |
150 | |
151 =head2 next_prediction | |
152 | |
153 Title : next_prediction | |
154 Usage : while($gene = $estscan->next_prediction()) { | |
155 # do something | |
156 } | |
157 Function: Returns the next gene structure prediction of the ESTScan result | |
158 file. Call this method repeatedly until FALSE is returned. | |
159 | |
160 So far, this method DOES NOT work for reverse strand predictions, | |
161 even though the code looks like. | |
162 Example : | |
163 Returns : A Bio::Tools::Prediction::Gene object. | |
164 Args : | |
165 | |
166 =cut | |
167 | |
168 sub next_prediction { | |
169 my ($self) = @_; | |
170 my ($gene, $seq, $cds, $predobj); | |
171 my $numins = 0; | |
172 | |
173 # predictions are in the format of FASTA sequences and can be parsed one | |
174 # at a time | |
175 $seq = $self->_fasta_stream()->next_seq(); | |
176 return unless $seq; | |
177 # there is a new prediction | |
178 $gene = Bio::Tools::Prediction::Gene->new('-primary' => "ORFprediction", | |
179 '-source' => "ESTScan"); | |
180 # score starts the description | |
181 $seq->desc() =~ /^([\d.]+)\s*(.*)/ or | |
182 $self->throw("unexpected format of description: no score in " . | |
183 $seq->desc()); | |
184 $gene->score($1); | |
185 $seq->desc($2); | |
186 # strand may end the description | |
187 if($seq->desc() =~ /(.*)minus strand$/) { | |
188 my $desc = $1; | |
189 $desc =~ s/;\s+$//; | |
190 $seq->desc($desc); | |
191 $gene->strand(-1); | |
192 } else { | |
193 $gene->strand(1); | |
194 } | |
195 # check for the format: default or 'all-in-one' (option -a) | |
196 if($seq->desc() =~ /^(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s*(.*)/) { | |
197 # default format | |
198 $seq->desc($5); | |
199 $predobj = Bio::Tools::Prediction::Exon->new('-source' => "ESTScan", | |
200 '-start' => $3, | |
201 '-end' => $4); | |
202 $predobj->strand($gene->strand()); | |
203 $predobj->score($gene->score()); # FIXME or $1, or $2 ? | |
204 $predobj->primary_tag("InternalExon"); | |
205 $predobj->seq_id($seq->display_id()); | |
206 # add to gene structure object | |
207 $gene->add_exon($predobj); | |
208 # add predicted CDS | |
209 $cds = $seq->seq(); | |
210 $cds =~ s/[a-z]//g; # remove the deletions, but keep the insertions | |
211 $cds = Bio::PrimarySeq->new('-seq' => $cds, | |
212 '-display_id' => $seq->display_id(), | |
213 '-desc' => $seq->desc(), | |
214 '-alphabet' => "dna"); | |
215 $gene->predicted_cds($cds); | |
216 $predobj->predicted_cds($cds); | |
217 if($gene->strand() == -1) { | |
218 $self->warn("reverse strand ORF, but unable to reverse coordinates!"); | |
219 } | |
220 } else { | |
221 # | |
222 # All-in-one format (hopefully). This encodes the following information | |
223 # into the sequence: | |
224 # 1) untranslated regions: stretches of lower-case letters | |
225 # 2) translated regions: stretches of upper-case letters | |
226 # 3) insertions in the translated regions: capital X | |
227 # 4) deletions in the translated regions: a single lower-case letter | |
228 # | |
229 # if reverse strand ORF, save a lot of hassle by reversing the sequence | |
230 if($gene->strand() == -1) { | |
231 $seq = $seq->revcom(); | |
232 } | |
233 my $seqstr = $seq->seq(); | |
234 while($seqstr =~ /^([a-z]*)([A-Z].*)$/) { | |
235 # leading 5'UTR | |
236 my $utr5 = $1; | |
237 # exon + 3'UTR | |
238 my $exonseq = $2; | |
239 # strip 3'UTR and following exons | |
240 if($exonseq =~ s/([a-z]{2,}.*)$//) { | |
241 $seqstr = $1; | |
242 } else { | |
243 $seqstr = ""; | |
244 } | |
245 # start: take care of yielding the absolute coordinate | |
246 my $start = CORE::length($utr5) + 1; | |
247 if($predobj) { | |
248 $start += $predobj->end() + $numins; | |
249 } | |
250 # for the end coordinate, we need to subtract the insertions | |
251 $cds = $exonseq; | |
252 $cds =~ s/[X]//g; | |
253 my $end = $start + CORE::length($cds) - 1; | |
254 # construct next exon object | |
255 $predobj = Bio::Tools::Prediction::Exon->new('-start' => $start, | |
256 '-end' => $end); | |
257 $predobj->source_tag("ESTScan"); | |
258 $predobj->primary_tag("InternalExon"); | |
259 $predobj->seq_id($seq->display_id()); | |
260 $predobj->strand($gene->strand()); | |
261 $predobj->score($gene->score()); | |
262 # add the exon to the gene structure object | |
263 $gene->add_exon($predobj); | |
264 # add the predicted CDS | |
265 $cds = $exonseq; | |
266 $cds =~ s/[a-z]//g; # remove the deletions, but keep the insertions | |
267 $cds = Bio::PrimarySeq->new('-seq' => $cds, | |
268 '-display_id' => $seq->display_id(), | |
269 '-desc' => $seq->desc(), | |
270 '-alphabet' => "dna"); | |
271 # only store the first one in the overall prediction | |
272 $gene->predicted_cds($cds) unless $gene->predicted_cds(); | |
273 $predobj->predicted_cds($cds); | |
274 # add the predicted insertions and deletions as subfeatures | |
275 # of the exon | |
276 my $fea = undef; | |
277 while($exonseq =~ /([a-zX])/g) { | |
278 my $indel = $1; | |
279 # start and end: start looking at the position after the | |
280 # previous feature | |
281 if($fea) { | |
282 $start = $fea->start()+$numins; | |
283 $start -= 1 if($fea->primary_tag() eq 'insertion'); | |
284 } else { | |
285 $start = $predobj->start()+$numins-1; | |
286 } | |
287 #print "# numins = $numins, indel = $indel, start = $start\n"; | |
288 $start = index($seq->seq(), $indel, $start) + 1 - $numins; | |
289 $fea = Bio::SeqFeature::Generic->new('-start' => $start, | |
290 '-end' => $start); | |
291 $fea->source_tag("ESTScan"); | |
292 $fea->seq_id($seq->display_id()); | |
293 $fea->strand($predobj->strand()); | |
294 if($indel eq 'X') { | |
295 # an insertion (depends on viewpoint: to get the 'real' | |
296 # CDS, a base has to be inserted, i.e., the HMMER model | |
297 # inserted a base; however, the sequencing process deleted | |
298 # a base that was there). | |
299 $fea->primary_tag("insertion"); | |
300 # we need to count insertions because these are left out | |
301 # of any coordinates saved in the objects (which is correct | |
302 # because insertions change the original sequence, so | |
303 # coordinates wouldn't match) | |
304 $numins++; | |
305 } else { | |
306 # a deletion (depends on viewpoint: to get the 'real' | |
307 # CDS, a base has to be deleted, i.e., the HMMER model | |
308 # deleted a base; however, the sequencing process inserted | |
309 # a base that wasn't there). | |
310 $fea->primary_tag("deletion"); | |
311 $fea->add_tag_value('base', $indel); | |
312 } | |
313 $predobj->add_sub_SeqFeature($fea); | |
314 } | |
315 } | |
316 } | |
317 | |
318 return $gene; | |
319 } | |
320 | |
321 =head2 close | |
322 | |
323 Title : close | |
324 Usage : $result->close() | |
325 Function: Closes the file handle associated with this result file. | |
326 Inherited method, overridden. | |
327 Example : | |
328 Returns : | |
329 Args : | |
330 | |
331 =cut | |
332 | |
333 sub close { | |
334 my ($self, @args) = @_; | |
335 | |
336 delete($self->{'_fastastream'}); | |
337 $self->SUPER::close(@args); | |
338 } | |
339 | |
340 =head2 _fasta_stream | |
341 | |
342 Title : _fasta_stream | |
343 Usage : $result->_fasta_stream() | |
344 Function: Gets/Sets the FASTA sequence IO stream for reading the contents of | |
345 the file associated with this MZEF result object. | |
346 | |
347 If called for the first time, creates the stream from the filehandle | |
348 if necessary. | |
349 Example : | |
350 Returns : | |
351 Args : | |
352 | |
353 =cut | |
354 | |
355 sub _fasta_stream { | |
356 my ($self, $stream) = @_; | |
357 | |
358 if($stream || (! exists($self->{'_fastastream'}))) { | |
359 if(! $stream) { | |
360 $stream = Bio::SeqIO->new('-fh' => $self->_fh(), | |
361 '-format' => "fasta"); | |
362 } | |
363 $self->{'_fastastream'} = $stream; | |
364 } | |
365 return $self->{'_fastastream'}; | |
366 } | |
367 | |
368 1; | |
369 |