comparison variant_effect_predictor/Bio/Tools/Sim4/Results.pm @ 0:21066c0abaf5 draft

Uploaded
author willmclaren
date Fri, 03 Aug 2012 10:04:48 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:21066c0abaf5
1
2 #
3 # BioPerl module for Bio::Tools::Sim4::Results
4 #
5 # Cared for by Ewan Birney <birney@sanger.ac.uk>
6 # and Hilmar Lapp <hlapp@gmx.net>
7 #
8 # Copyright Ewan Birney and Hilmar Lapp
9 #
10 # You may distribute this module under the same terms as perl itself
11
12 # POD documentation - main docs before the code
13
14 =head1 NAME
15
16 Bio::Tools::Sim4::Results - Results of one Sim4 run
17
18 =head1 SYNOPSIS
19
20 # to preset the order of EST and genomic file as given on the sim4
21 # command line:
22 my $sim4 = Bio::Tools::Sim4::Results->new(-file => 'result.sim4',
23 -estfirst => 1);
24 # to let the order be determined automatically (by length comparison):
25 $sim4 = Bio::Tools::Sim4::Results->new( -file => 'sim4.results' );
26 # filehandle:
27 $sim4 = Bio::Tools::Sim4::Results->new( -fh => \*INPUT );
28
29 # parse the results
30 while(my $exonset = $sim4->next_exonset()) {
31 # $exonset is-a Bio::SeqFeature::Generic with Bio::Tools::Sim4::Exons
32 # as sub features
33 print "Delimited on sequence ", $exonset->seq_id(),
34 "from ", $exonset->start(), " to ", $exonset->end(), "\n";
35 foreach my $exon ( $exonset->sub_SeqFeature() ) {
36 # $exon is-a Bio::SeqFeature::FeaturePair
37 print "Exon from ", $exon->start, " to ", $exon->end,
38 " on strand ", $exon->strand(), "\n";
39 # you can get out what it matched using the est_hit attribute
40 my $homol = $exon->est_hit();
41 print "Matched to sequence ", $homol->seq_id,
42 " at ", $homol->start," to ", $homol->end, "\n";
43 }
44 }
45
46 # essential if you gave a filename at initialization (otherwise the file
47 # stays open)
48 $sim4->close();
49
50 =head1 DESCRIPTION
51
52 The sim4 module provides a parser and results object for sim4 output. The
53 sim4 results are specialised types of SeqFeatures, meaning you can add them
54 to AnnSeq objects fine, and manipulate them in the "normal" seqfeature manner.
55
56 The sim4 Exon objects are Bio::SeqFeature::FeaturePair inherited objects. The
57 $esthit = $exon-E<gt>est_hit() is the alignment as a feature on the matching
58 object (normally, an EST), in which the start/end points are where the hit
59 lies.
60
61 To make this module work sensibly you need to run
62
63 sim4 genomic.fasta est.database.fasta
64 or
65 sim4 est.fasta genomic.database.fasta
66
67 To get the sequence identifiers recorded for the first sequence, too, use
68 A=4 as output option for sim4.
69
70 One fiddle here is that there are only two real possibilities to the matching
71 criteria: either one sequence needs reversing or not. Because of this, it
72 is impossible to tell whether the match is in the forward or reverse strand
73 of the genomic DNA. We solve this here by assuming that the genomic DNA is
74 always forward. As a consequence, the strand attribute of the matching EST is
75 unknown, and the strand attribute of the genomic DNA (i.e., the Exon object)
76 will reflect the direction of the hit.
77
78 See the documentation of parse_next_alignment() for abilities of the parser
79 to deal with the different output format options of sim4.
80
81 =head1 FEEDBACK
82
83 =head2 Mailing Lists
84
85 User feedback is an integral part of the evolution of this and other
86 Bioperl modules. Send your comments and suggestions preferably to one
87 of the Bioperl mailing lists. Your participation is much appreciated.
88
89 bioperl-l@bioperl.org - General discussion
90 http://bio.perl.org/MailList.html - About the mailing lists
91
92 =head2 Reporting Bugs
93
94 Report bugs to the Bioperl bug tracking system to help us keep track
95 the bugs and their resolution. Bug reports can be submitted via email
96 or the web:
97
98 bioperl-bugs@bio.perl.org
99 http://bugzilla.bioperl.org/
100
101 =head1 AUTHOR - Ewan Birney, Hilmar Lapp
102
103 Email birney@sanger.ac.uk
104 hlapp@gmx.net (or hilmar.lapp@pharma.novartis.com)
105
106 Describe contact details here
107
108 =head1 APPENDIX
109
110 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
111
112 =cut
113
114
115 # Let the code begin...
116
117
118 package Bio::Tools::Sim4::Results;
119 use vars qw(@ISA);
120 use strict;
121
122 # Object preamble - inherits from Bio::Root::Object
123
124 use File::Basename;
125 use Bio::Root::Root;
126 use Bio::Tools::AnalysisResult;
127 use Bio::Tools::Sim4::Exon;
128
129 @ISA = qw(Bio::Tools::AnalysisResult);
130
131
132 sub _initialize_state {
133 my($self,@args) = @_;
134
135 # call the inherited method first
136 my $make = $self->SUPER::_initialize_state(@args);
137
138 my ($est_is_first) = $self->_rearrange([qw(ESTFIRST)], @args);
139
140 delete($self->{'_est_is_first'});
141 $self->{'_est_is_first'} = $est_is_first if(defined($est_is_first));
142 $self->analysis_method("Sim4");
143 }
144
145 =head2 analysis_method
146
147 Usage : $sim4->analysis_method();
148 Purpose : Inherited method. Overridden to ensure that the name matches
149 /sim4/i.
150 Returns : String
151 Argument : n/a
152
153 =cut
154
155 #-------------
156 sub analysis_method {
157 #-------------
158 my ($self, $method) = @_;
159 if($method && ($method !~ /sim4/i)) {
160 $self->throw("method $method not supported in " . ref($self));
161 }
162 return $self->SUPER::analysis_method($method);
163 }
164
165 =head2 parse_next_alignment
166
167 Title : parse_next_alignment
168 Usage : @exons = $sim4_result->parse_next_alignment;
169 foreach $exon (@exons) {
170 # do something
171 }
172 Function: Parses the next alignment of the Sim4 result file and returns the
173 found exons as an array of Bio::Tools::Sim4::Exon objects. Call
174 this method repeatedly until an empty array is returned to get the
175 results for all alignments.
176
177 The $exon->seq_id() attribute will be set to the identifier of the
178 respective sequence for both sequences if A=4 was used in the sim4
179 run, and otherwise for the second sequence only. If the output does
180 not contain the identifier, the filename stripped of path and
181 extension is used instead. In addition, the full filename
182 will be recorded for both features ($exon inherits off
183 Bio::SeqFeature::SimilarityPair) as tag 'filename'. The length
184 is accessible via the seqlength() attribute of $exon->query() and
185 $exon->est_hit().
186
187 Note that this method is capable of dealing with outputs generated
188 with format 0,1,3, and 4 (via the A=n option to sim4). It
189 automatically determines which of the two sequences has been
190 reversed, and adjusts the coordinates for that sequence. It will
191 also detect whether the EST sequence(s) were given as first or as
192 second file to sim4, unless this has been specified at creation
193 time of the object.
194
195 Example :
196 Returns : An array of Bio::Tools::Sim4::Exon objects
197 Args :
198
199
200 =cut
201
202 sub parse_next_alignment {
203 my ($self) = @_;
204 my @exons = ();
205 my %seq1props = ();
206 my %seq2props = ();
207 # we refer to the properties of each seq by reference
208 my ($estseq, $genomseq, $to_reverse);
209 my $started = 0;
210 my $hit_direction = 1;
211 my $output_fmt = 3; # same as 0 and 1 (we cannot deal with A=2 produced
212 # output yet)
213
214 while(defined($_ = $self->_readline())) {
215 #chomp();
216
217 #
218 # bascially, each sim4 'hit' starts with seq1...
219 #
220 /^seq1/ && do {
221 if($started) {
222 $self->_pushback($_);
223 last;
224 }
225 $started = 1;
226
227 # filename and length of seq 1
228 /^seq1\s+=\s+(\S+)\,\s+(\d+)/ ||
229 $self->throw("Sim4 parsing error on seq1 [$_] line. Sorry!");
230 $seq1props{'filename'} = $1;
231 $seq1props{'length'} = $2;
232 next;
233 };
234 /^seq2/ && do {
235 # the second hit has also the database name in the >name syntax
236 # (in brackets).
237 /^seq2\s+=\s+(\S+)\s+\(>?(\S+)\s*\)\,\s+(\d+)/||
238 $self->throw("Sim4 parsing error on seq2 [$_] line. Sorry!");
239 $seq2props{'filename'} = $1;
240 $seq2props{'seqname'} = $2;
241 $seq2props{'length'} = $3;
242 next;
243 };
244 if(/^>(\S+)\s*(.*)$/) {
245 # output option was A=4, which not only gives the complete
246 # description lines, but also causes the longer sequence to be
247 # reversed if the second file contained one (genomic) sequence
248 $seq1props{'seqname'} = $1;
249 $seq1props{'description'} = $2 if $2;
250 $output_fmt = 4;
251 # we handle seq1 and seq2 both here
252 if(defined($_ = $self->_readline()) && (/^>(\S+)\s*(.*)$/)) {
253 $seq2props{'seqname'} = $1; # redundant, since already set above
254 $seq2props{'description'} = $2 if $2;
255 }
256 next;
257 }
258 /^\(complement\)/ && do {
259 $hit_direction = -1;
260 next;
261 };
262 # this matches
263 # start-end (start-end) pctid%
264 if(/(\d+)-(\d+)\s+\((\d+)-(\d+)\)\s+(\d+)%/) {
265 $seq1props{'start'} = $1;
266 $seq1props{'end'} = $2;
267 $seq2props{'start'} = $3;
268 $seq2props{'end'} = $4;
269 my $pctid = $5;
270
271 if(! defined($estseq)) {
272 # for the first time here: need to set the references referring
273 # to seq1 and seq2
274 if(! exists($self->{'_est_is_first'})) {
275 # detect which one is the EST by looking at the lengths,
276 # and assume that this holds throughout the entire result
277 # file (i.e., when this method is called for the next
278 # alignment, this will not be checked again)
279 if($seq1props{'length'} > $seq2props{'length'}) {
280 $self->{'_est_is_first'} = 0;
281 } else {
282 $self->{'_est_is_first'} = 1;
283 }
284 }
285 if($self->{'_est_is_first'}) {
286 $estseq = \%seq1props;
287 $genomseq = \%seq2props;
288 # if the EST is given first, A=4 selects the genomic
289 # seq for being reversed (reversing the EST is default)
290 $to_reverse = ($output_fmt == 4) ? $genomseq : $estseq;
291 } else {
292 $estseq = \%seq2props;
293 $genomseq = \%seq1props;
294 # if the EST is the second, A=4 does not change the
295 # seq being reversed (always the EST is reversed)
296 $to_reverse = $estseq;
297 }
298 }
299 if($hit_direction == -1) {
300 # we have to reverse the coordinates of one of both seqs
301 my $tmp = $to_reverse->{'start'};
302 $to_reverse->{'start'} =
303 $to_reverse->{'length'} - $to_reverse->{'end'} + 1;
304 $to_reverse->{'end'} = $to_reverse->{'length'} - $tmp + 1;
305 }
306 # create and initialize the exon object
307 my $exon = Bio::Tools::Sim4::Exon->new(
308 '-start' => $genomseq->{'start'},
309 '-end' => $genomseq->{'end'},
310 '-strand' => $hit_direction);
311 if(exists($genomseq->{'seqname'})) {
312 $exon->seq_id($genomseq->{'seqname'});
313 } else {
314 # take filename stripped of path as fall back
315 my ($basename) = &File::Basename::fileparse($genomseq->{'filename'}, '\..*');
316 $exon->seq_id($basename);
317 }
318 $exon->feature1()->add_tag_value('filename',
319 $genomseq->{'filename'});
320 # feature1 is supposed to be initialized to a Similarity object,
321 # but we provide a safety net
322 if($exon->feature1()->can('seqlength')) {
323 $exon->feature1()->seqlength($genomseq->{'length'});
324 } else {
325 $exon->feature1()->add_tag_value('SeqLength',
326 $genomseq->{'length'});
327 }
328 # create and initialize the feature wrapping the 'hit' (the EST)
329 my $fea2 = Bio::SeqFeature::Similarity->new(
330 '-start' => $estseq->{'start'},
331 '-end' => $estseq->{'end'},
332 '-strand' => 0,
333 '-primary' => "aligning_EST");
334 if(exists($estseq->{'seqname'})) {
335 $fea2->seq_id($estseq->{'seqname'});
336 } else {
337 # take filename stripped of path as fall back
338 my ($basename) =
339 &File::Basename::fileparse($estseq->{'filename'}, '\..*');
340 $fea2->seq_id($basename);
341 }
342 $fea2->add_tag_value('filename', $estseq->{'filename'});
343 $fea2->seqlength($estseq->{'length'});
344 # store
345 $exon->est_hit($fea2);
346 # general properties
347 $exon->source_tag($self->analysis_method());
348 $exon->percentage_id($pctid);
349 $exon->score($exon->percentage_id());
350 # push onto array
351 push(@exons, $exon);
352 next; # back to while loop
353 }
354 }
355 return @exons;
356 }
357
358 =head2 next_exonset
359
360 Title : next_exonset
361 Usage : $exonset = $sim4_result->parse_next_exonset;
362 print "Exons start at ", $exonset->start(),
363 "and end at ", $exonset->end(), "\n";
364 foreach $exon ($exonset->sub_SeqFeature()) {
365 # do something
366 }
367 Function: Parses the next alignment of the Sim4 result file and returns the
368 set of exons as a container of features. The container is itself
369 a Bio::SeqFeature::Generic object, with the Bio::Tools::Sim4::Exon
370 objects as sub features. Start, end, and strand of the container
371 will represent the total region covered by the exons of this set.
372
373 See the documentation of parse_next_alignment() for further
374 reference about parsing and how the information is stored.
375
376 Example :
377 Returns : An Bio::SeqFeature::Generic object holding Bio::Tools::Sim4::Exon
378 objects as sub features.
379 Args :
380
381 =cut
382
383 sub next_exonset {
384 my $self = shift;
385 my $exonset;
386
387 # get the next array of exons
388 my @exons = $self->parse_next_alignment();
389 return if($#exons < 0);
390 # create the container of exons as a feature object itself, with the
391 # data of the first exon for initialization
392 $exonset = Bio::SeqFeature::Generic->new('-start' => $exons[0]->start(),
393 '-end' => $exons[0]->end(),
394 '-strand' => $exons[0]->strand(),
395 '-primary' => "ExonSet");
396 $exonset->source_tag($exons[0]->source_tag());
397 $exonset->seq_id($exons[0]->seq_id());
398 # now add all exons as sub features, with enabling EXPANsion of the region
399 # covered in total
400 foreach my $exon (@exons) {
401 $exonset->add_sub_SeqFeature($exon, 'EXPAND');
402 }
403 return $exonset;
404 }
405
406 =head2 next_feature
407
408 Title : next_feature
409 Usage : while($exonset = $sim4->next_feature()) {
410 # do something
411 }
412 Function: Does the same as L<next_exonset()>. See there for documentation of
413 the functionality. Call this method repeatedly until FALSE is
414 returned.
415
416 The returned object is actually a SeqFeatureI implementing object.
417 This method is required for classes implementing the
418 SeqAnalysisParserI interface, and is merely an alias for
419 next_exonset() at present.
420
421 Example :
422 Returns : A Bio::SeqFeature::Generic object.
423 Args :
424
425 =cut
426
427 sub next_feature {
428 my ($self,@args) = @_;
429 # even though next_exonset doesn't expect any args (and this method
430 # does neither), we pass on args in order to be prepared if this changes
431 # ever
432 return $self->next_exonset(@args);
433 }
434
435 1;