Mercurial > repos > willmclaren > ensembl_vep
comparison variant_effect_predictor/Bio/Tools/Sim4/Results.pm @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:21066c0abaf5 |
---|---|
1 | |
2 # | |
3 # BioPerl module for Bio::Tools::Sim4::Results | |
4 # | |
5 # Cared for by Ewan Birney <birney@sanger.ac.uk> | |
6 # and Hilmar Lapp <hlapp@gmx.net> | |
7 # | |
8 # Copyright Ewan Birney and Hilmar Lapp | |
9 # | |
10 # You may distribute this module under the same terms as perl itself | |
11 | |
12 # POD documentation - main docs before the code | |
13 | |
14 =head1 NAME | |
15 | |
16 Bio::Tools::Sim4::Results - Results of one Sim4 run | |
17 | |
18 =head1 SYNOPSIS | |
19 | |
20 # to preset the order of EST and genomic file as given on the sim4 | |
21 # command line: | |
22 my $sim4 = Bio::Tools::Sim4::Results->new(-file => 'result.sim4', | |
23 -estfirst => 1); | |
24 # to let the order be determined automatically (by length comparison): | |
25 $sim4 = Bio::Tools::Sim4::Results->new( -file => 'sim4.results' ); | |
26 # filehandle: | |
27 $sim4 = Bio::Tools::Sim4::Results->new( -fh => \*INPUT ); | |
28 | |
29 # parse the results | |
30 while(my $exonset = $sim4->next_exonset()) { | |
31 # $exonset is-a Bio::SeqFeature::Generic with Bio::Tools::Sim4::Exons | |
32 # as sub features | |
33 print "Delimited on sequence ", $exonset->seq_id(), | |
34 "from ", $exonset->start(), " to ", $exonset->end(), "\n"; | |
35 foreach my $exon ( $exonset->sub_SeqFeature() ) { | |
36 # $exon is-a Bio::SeqFeature::FeaturePair | |
37 print "Exon from ", $exon->start, " to ", $exon->end, | |
38 " on strand ", $exon->strand(), "\n"; | |
39 # you can get out what it matched using the est_hit attribute | |
40 my $homol = $exon->est_hit(); | |
41 print "Matched to sequence ", $homol->seq_id, | |
42 " at ", $homol->start," to ", $homol->end, "\n"; | |
43 } | |
44 } | |
45 | |
46 # essential if you gave a filename at initialization (otherwise the file | |
47 # stays open) | |
48 $sim4->close(); | |
49 | |
50 =head1 DESCRIPTION | |
51 | |
52 The sim4 module provides a parser and results object for sim4 output. The | |
53 sim4 results are specialised types of SeqFeatures, meaning you can add them | |
54 to AnnSeq objects fine, and manipulate them in the "normal" seqfeature manner. | |
55 | |
56 The sim4 Exon objects are Bio::SeqFeature::FeaturePair inherited objects. The | |
57 $esthit = $exon-E<gt>est_hit() is the alignment as a feature on the matching | |
58 object (normally, an EST), in which the start/end points are where the hit | |
59 lies. | |
60 | |
61 To make this module work sensibly you need to run | |
62 | |
63 sim4 genomic.fasta est.database.fasta | |
64 or | |
65 sim4 est.fasta genomic.database.fasta | |
66 | |
67 To get the sequence identifiers recorded for the first sequence, too, use | |
68 A=4 as output option for sim4. | |
69 | |
70 One fiddle here is that there are only two real possibilities to the matching | |
71 criteria: either one sequence needs reversing or not. Because of this, it | |
72 is impossible to tell whether the match is in the forward or reverse strand | |
73 of the genomic DNA. We solve this here by assuming that the genomic DNA is | |
74 always forward. As a consequence, the strand attribute of the matching EST is | |
75 unknown, and the strand attribute of the genomic DNA (i.e., the Exon object) | |
76 will reflect the direction of the hit. | |
77 | |
78 See the documentation of parse_next_alignment() for abilities of the parser | |
79 to deal with the different output format options of sim4. | |
80 | |
81 =head1 FEEDBACK | |
82 | |
83 =head2 Mailing Lists | |
84 | |
85 User feedback is an integral part of the evolution of this and other | |
86 Bioperl modules. Send your comments and suggestions preferably to one | |
87 of the Bioperl mailing lists. Your participation is much appreciated. | |
88 | |
89 bioperl-l@bioperl.org - General discussion | |
90 http://bio.perl.org/MailList.html - About the mailing lists | |
91 | |
92 =head2 Reporting Bugs | |
93 | |
94 Report bugs to the Bioperl bug tracking system to help us keep track | |
95 the bugs and their resolution. Bug reports can be submitted via email | |
96 or the web: | |
97 | |
98 bioperl-bugs@bio.perl.org | |
99 http://bugzilla.bioperl.org/ | |
100 | |
101 =head1 AUTHOR - Ewan Birney, Hilmar Lapp | |
102 | |
103 Email birney@sanger.ac.uk | |
104 hlapp@gmx.net (or hilmar.lapp@pharma.novartis.com) | |
105 | |
106 Describe contact details here | |
107 | |
108 =head1 APPENDIX | |
109 | |
110 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ | |
111 | |
112 =cut | |
113 | |
114 | |
115 # Let the code begin... | |
116 | |
117 | |
118 package Bio::Tools::Sim4::Results; | |
119 use vars qw(@ISA); | |
120 use strict; | |
121 | |
122 # Object preamble - inherits from Bio::Root::Object | |
123 | |
124 use File::Basename; | |
125 use Bio::Root::Root; | |
126 use Bio::Tools::AnalysisResult; | |
127 use Bio::Tools::Sim4::Exon; | |
128 | |
129 @ISA = qw(Bio::Tools::AnalysisResult); | |
130 | |
131 | |
132 sub _initialize_state { | |
133 my($self,@args) = @_; | |
134 | |
135 # call the inherited method first | |
136 my $make = $self->SUPER::_initialize_state(@args); | |
137 | |
138 my ($est_is_first) = $self->_rearrange([qw(ESTFIRST)], @args); | |
139 | |
140 delete($self->{'_est_is_first'}); | |
141 $self->{'_est_is_first'} = $est_is_first if(defined($est_is_first)); | |
142 $self->analysis_method("Sim4"); | |
143 } | |
144 | |
145 =head2 analysis_method | |
146 | |
147 Usage : $sim4->analysis_method(); | |
148 Purpose : Inherited method. Overridden to ensure that the name matches | |
149 /sim4/i. | |
150 Returns : String | |
151 Argument : n/a | |
152 | |
153 =cut | |
154 | |
155 #------------- | |
156 sub analysis_method { | |
157 #------------- | |
158 my ($self, $method) = @_; | |
159 if($method && ($method !~ /sim4/i)) { | |
160 $self->throw("method $method not supported in " . ref($self)); | |
161 } | |
162 return $self->SUPER::analysis_method($method); | |
163 } | |
164 | |
165 =head2 parse_next_alignment | |
166 | |
167 Title : parse_next_alignment | |
168 Usage : @exons = $sim4_result->parse_next_alignment; | |
169 foreach $exon (@exons) { | |
170 # do something | |
171 } | |
172 Function: Parses the next alignment of the Sim4 result file and returns the | |
173 found exons as an array of Bio::Tools::Sim4::Exon objects. Call | |
174 this method repeatedly until an empty array is returned to get the | |
175 results for all alignments. | |
176 | |
177 The $exon->seq_id() attribute will be set to the identifier of the | |
178 respective sequence for both sequences if A=4 was used in the sim4 | |
179 run, and otherwise for the second sequence only. If the output does | |
180 not contain the identifier, the filename stripped of path and | |
181 extension is used instead. In addition, the full filename | |
182 will be recorded for both features ($exon inherits off | |
183 Bio::SeqFeature::SimilarityPair) as tag 'filename'. The length | |
184 is accessible via the seqlength() attribute of $exon->query() and | |
185 $exon->est_hit(). | |
186 | |
187 Note that this method is capable of dealing with outputs generated | |
188 with format 0,1,3, and 4 (via the A=n option to sim4). It | |
189 automatically determines which of the two sequences has been | |
190 reversed, and adjusts the coordinates for that sequence. It will | |
191 also detect whether the EST sequence(s) were given as first or as | |
192 second file to sim4, unless this has been specified at creation | |
193 time of the object. | |
194 | |
195 Example : | |
196 Returns : An array of Bio::Tools::Sim4::Exon objects | |
197 Args : | |
198 | |
199 | |
200 =cut | |
201 | |
202 sub parse_next_alignment { | |
203 my ($self) = @_; | |
204 my @exons = (); | |
205 my %seq1props = (); | |
206 my %seq2props = (); | |
207 # we refer to the properties of each seq by reference | |
208 my ($estseq, $genomseq, $to_reverse); | |
209 my $started = 0; | |
210 my $hit_direction = 1; | |
211 my $output_fmt = 3; # same as 0 and 1 (we cannot deal with A=2 produced | |
212 # output yet) | |
213 | |
214 while(defined($_ = $self->_readline())) { | |
215 #chomp(); | |
216 | |
217 # | |
218 # bascially, each sim4 'hit' starts with seq1... | |
219 # | |
220 /^seq1/ && do { | |
221 if($started) { | |
222 $self->_pushback($_); | |
223 last; | |
224 } | |
225 $started = 1; | |
226 | |
227 # filename and length of seq 1 | |
228 /^seq1\s+=\s+(\S+)\,\s+(\d+)/ || | |
229 $self->throw("Sim4 parsing error on seq1 [$_] line. Sorry!"); | |
230 $seq1props{'filename'} = $1; | |
231 $seq1props{'length'} = $2; | |
232 next; | |
233 }; | |
234 /^seq2/ && do { | |
235 # the second hit has also the database name in the >name syntax | |
236 # (in brackets). | |
237 /^seq2\s+=\s+(\S+)\s+\(>?(\S+)\s*\)\,\s+(\d+)/|| | |
238 $self->throw("Sim4 parsing error on seq2 [$_] line. Sorry!"); | |
239 $seq2props{'filename'} = $1; | |
240 $seq2props{'seqname'} = $2; | |
241 $seq2props{'length'} = $3; | |
242 next; | |
243 }; | |
244 if(/^>(\S+)\s*(.*)$/) { | |
245 # output option was A=4, which not only gives the complete | |
246 # description lines, but also causes the longer sequence to be | |
247 # reversed if the second file contained one (genomic) sequence | |
248 $seq1props{'seqname'} = $1; | |
249 $seq1props{'description'} = $2 if $2; | |
250 $output_fmt = 4; | |
251 # we handle seq1 and seq2 both here | |
252 if(defined($_ = $self->_readline()) && (/^>(\S+)\s*(.*)$/)) { | |
253 $seq2props{'seqname'} = $1; # redundant, since already set above | |
254 $seq2props{'description'} = $2 if $2; | |
255 } | |
256 next; | |
257 } | |
258 /^\(complement\)/ && do { | |
259 $hit_direction = -1; | |
260 next; | |
261 }; | |
262 # this matches | |
263 # start-end (start-end) pctid% | |
264 if(/(\d+)-(\d+)\s+\((\d+)-(\d+)\)\s+(\d+)%/) { | |
265 $seq1props{'start'} = $1; | |
266 $seq1props{'end'} = $2; | |
267 $seq2props{'start'} = $3; | |
268 $seq2props{'end'} = $4; | |
269 my $pctid = $5; | |
270 | |
271 if(! defined($estseq)) { | |
272 # for the first time here: need to set the references referring | |
273 # to seq1 and seq2 | |
274 if(! exists($self->{'_est_is_first'})) { | |
275 # detect which one is the EST by looking at the lengths, | |
276 # and assume that this holds throughout the entire result | |
277 # file (i.e., when this method is called for the next | |
278 # alignment, this will not be checked again) | |
279 if($seq1props{'length'} > $seq2props{'length'}) { | |
280 $self->{'_est_is_first'} = 0; | |
281 } else { | |
282 $self->{'_est_is_first'} = 1; | |
283 } | |
284 } | |
285 if($self->{'_est_is_first'}) { | |
286 $estseq = \%seq1props; | |
287 $genomseq = \%seq2props; | |
288 # if the EST is given first, A=4 selects the genomic | |
289 # seq for being reversed (reversing the EST is default) | |
290 $to_reverse = ($output_fmt == 4) ? $genomseq : $estseq; | |
291 } else { | |
292 $estseq = \%seq2props; | |
293 $genomseq = \%seq1props; | |
294 # if the EST is the second, A=4 does not change the | |
295 # seq being reversed (always the EST is reversed) | |
296 $to_reverse = $estseq; | |
297 } | |
298 } | |
299 if($hit_direction == -1) { | |
300 # we have to reverse the coordinates of one of both seqs | |
301 my $tmp = $to_reverse->{'start'}; | |
302 $to_reverse->{'start'} = | |
303 $to_reverse->{'length'} - $to_reverse->{'end'} + 1; | |
304 $to_reverse->{'end'} = $to_reverse->{'length'} - $tmp + 1; | |
305 } | |
306 # create and initialize the exon object | |
307 my $exon = Bio::Tools::Sim4::Exon->new( | |
308 '-start' => $genomseq->{'start'}, | |
309 '-end' => $genomseq->{'end'}, | |
310 '-strand' => $hit_direction); | |
311 if(exists($genomseq->{'seqname'})) { | |
312 $exon->seq_id($genomseq->{'seqname'}); | |
313 } else { | |
314 # take filename stripped of path as fall back | |
315 my ($basename) = &File::Basename::fileparse($genomseq->{'filename'}, '\..*'); | |
316 $exon->seq_id($basename); | |
317 } | |
318 $exon->feature1()->add_tag_value('filename', | |
319 $genomseq->{'filename'}); | |
320 # feature1 is supposed to be initialized to a Similarity object, | |
321 # but we provide a safety net | |
322 if($exon->feature1()->can('seqlength')) { | |
323 $exon->feature1()->seqlength($genomseq->{'length'}); | |
324 } else { | |
325 $exon->feature1()->add_tag_value('SeqLength', | |
326 $genomseq->{'length'}); | |
327 } | |
328 # create and initialize the feature wrapping the 'hit' (the EST) | |
329 my $fea2 = Bio::SeqFeature::Similarity->new( | |
330 '-start' => $estseq->{'start'}, | |
331 '-end' => $estseq->{'end'}, | |
332 '-strand' => 0, | |
333 '-primary' => "aligning_EST"); | |
334 if(exists($estseq->{'seqname'})) { | |
335 $fea2->seq_id($estseq->{'seqname'}); | |
336 } else { | |
337 # take filename stripped of path as fall back | |
338 my ($basename) = | |
339 &File::Basename::fileparse($estseq->{'filename'}, '\..*'); | |
340 $fea2->seq_id($basename); | |
341 } | |
342 $fea2->add_tag_value('filename', $estseq->{'filename'}); | |
343 $fea2->seqlength($estseq->{'length'}); | |
344 # store | |
345 $exon->est_hit($fea2); | |
346 # general properties | |
347 $exon->source_tag($self->analysis_method()); | |
348 $exon->percentage_id($pctid); | |
349 $exon->score($exon->percentage_id()); | |
350 # push onto array | |
351 push(@exons, $exon); | |
352 next; # back to while loop | |
353 } | |
354 } | |
355 return @exons; | |
356 } | |
357 | |
358 =head2 next_exonset | |
359 | |
360 Title : next_exonset | |
361 Usage : $exonset = $sim4_result->parse_next_exonset; | |
362 print "Exons start at ", $exonset->start(), | |
363 "and end at ", $exonset->end(), "\n"; | |
364 foreach $exon ($exonset->sub_SeqFeature()) { | |
365 # do something | |
366 } | |
367 Function: Parses the next alignment of the Sim4 result file and returns the | |
368 set of exons as a container of features. The container is itself | |
369 a Bio::SeqFeature::Generic object, with the Bio::Tools::Sim4::Exon | |
370 objects as sub features. Start, end, and strand of the container | |
371 will represent the total region covered by the exons of this set. | |
372 | |
373 See the documentation of parse_next_alignment() for further | |
374 reference about parsing and how the information is stored. | |
375 | |
376 Example : | |
377 Returns : An Bio::SeqFeature::Generic object holding Bio::Tools::Sim4::Exon | |
378 objects as sub features. | |
379 Args : | |
380 | |
381 =cut | |
382 | |
383 sub next_exonset { | |
384 my $self = shift; | |
385 my $exonset; | |
386 | |
387 # get the next array of exons | |
388 my @exons = $self->parse_next_alignment(); | |
389 return if($#exons < 0); | |
390 # create the container of exons as a feature object itself, with the | |
391 # data of the first exon for initialization | |
392 $exonset = Bio::SeqFeature::Generic->new('-start' => $exons[0]->start(), | |
393 '-end' => $exons[0]->end(), | |
394 '-strand' => $exons[0]->strand(), | |
395 '-primary' => "ExonSet"); | |
396 $exonset->source_tag($exons[0]->source_tag()); | |
397 $exonset->seq_id($exons[0]->seq_id()); | |
398 # now add all exons as sub features, with enabling EXPANsion of the region | |
399 # covered in total | |
400 foreach my $exon (@exons) { | |
401 $exonset->add_sub_SeqFeature($exon, 'EXPAND'); | |
402 } | |
403 return $exonset; | |
404 } | |
405 | |
406 =head2 next_feature | |
407 | |
408 Title : next_feature | |
409 Usage : while($exonset = $sim4->next_feature()) { | |
410 # do something | |
411 } | |
412 Function: Does the same as L<next_exonset()>. See there for documentation of | |
413 the functionality. Call this method repeatedly until FALSE is | |
414 returned. | |
415 | |
416 The returned object is actually a SeqFeatureI implementing object. | |
417 This method is required for classes implementing the | |
418 SeqAnalysisParserI interface, and is merely an alias for | |
419 next_exonset() at present. | |
420 | |
421 Example : | |
422 Returns : A Bio::SeqFeature::Generic object. | |
423 Args : | |
424 | |
425 =cut | |
426 | |
427 sub next_feature { | |
428 my ($self,@args) = @_; | |
429 # even though next_exonset doesn't expect any args (and this method | |
430 # does neither), we pass on args in order to be prepared if this changes | |
431 # ever | |
432 return $self->next_exonset(@args); | |
433 } | |
434 | |
435 1; |