0
|
1
|
|
2 #
|
|
3 # BioPerl module for Bio::Tools::Sim4::Results
|
|
4 #
|
|
5 # Cared for by Ewan Birney <birney@sanger.ac.uk>
|
|
6 # and Hilmar Lapp <hlapp@gmx.net>
|
|
7 #
|
|
8 # Copyright Ewan Birney and Hilmar Lapp
|
|
9 #
|
|
10 # You may distribute this module under the same terms as perl itself
|
|
11
|
|
12 # POD documentation - main docs before the code
|
|
13
|
|
14 =head1 NAME
|
|
15
|
|
16 Bio::Tools::Sim4::Results - Results of one Sim4 run
|
|
17
|
|
18 =head1 SYNOPSIS
|
|
19
|
|
20 # to preset the order of EST and genomic file as given on the sim4
|
|
21 # command line:
|
|
22 my $sim4 = Bio::Tools::Sim4::Results->new(-file => 'result.sim4',
|
|
23 -estfirst => 1);
|
|
24 # to let the order be determined automatically (by length comparison):
|
|
25 $sim4 = Bio::Tools::Sim4::Results->new( -file => 'sim4.results' );
|
|
26 # filehandle:
|
|
27 $sim4 = Bio::Tools::Sim4::Results->new( -fh => \*INPUT );
|
|
28
|
|
29 # parse the results
|
|
30 while(my $exonset = $sim4->next_exonset()) {
|
|
31 # $exonset is-a Bio::SeqFeature::Generic with Bio::Tools::Sim4::Exons
|
|
32 # as sub features
|
|
33 print "Delimited on sequence ", $exonset->seq_id(),
|
|
34 "from ", $exonset->start(), " to ", $exonset->end(), "\n";
|
|
35 foreach my $exon ( $exonset->sub_SeqFeature() ) {
|
|
36 # $exon is-a Bio::SeqFeature::FeaturePair
|
|
37 print "Exon from ", $exon->start, " to ", $exon->end,
|
|
38 " on strand ", $exon->strand(), "\n";
|
|
39 # you can get out what it matched using the est_hit attribute
|
|
40 my $homol = $exon->est_hit();
|
|
41 print "Matched to sequence ", $homol->seq_id,
|
|
42 " at ", $homol->start," to ", $homol->end, "\n";
|
|
43 }
|
|
44 }
|
|
45
|
|
46 # essential if you gave a filename at initialization (otherwise the file
|
|
47 # stays open)
|
|
48 $sim4->close();
|
|
49
|
|
50 =head1 DESCRIPTION
|
|
51
|
|
52 The sim4 module provides a parser and results object for sim4 output. The
|
|
53 sim4 results are specialised types of SeqFeatures, meaning you can add them
|
|
54 to AnnSeq objects fine, and manipulate them in the "normal" seqfeature manner.
|
|
55
|
|
56 The sim4 Exon objects are Bio::SeqFeature::FeaturePair inherited objects. The
|
|
57 $esthit = $exon-E<gt>est_hit() is the alignment as a feature on the matching
|
|
58 object (normally, an EST), in which the start/end points are where the hit
|
|
59 lies.
|
|
60
|
|
61 To make this module work sensibly you need to run
|
|
62
|
|
63 sim4 genomic.fasta est.database.fasta
|
|
64 or
|
|
65 sim4 est.fasta genomic.database.fasta
|
|
66
|
|
67 To get the sequence identifiers recorded for the first sequence, too, use
|
|
68 A=4 as output option for sim4.
|
|
69
|
|
70 One fiddle here is that there are only two real possibilities to the matching
|
|
71 criteria: either one sequence needs reversing or not. Because of this, it
|
|
72 is impossible to tell whether the match is in the forward or reverse strand
|
|
73 of the genomic DNA. We solve this here by assuming that the genomic DNA is
|
|
74 always forward. As a consequence, the strand attribute of the matching EST is
|
|
75 unknown, and the strand attribute of the genomic DNA (i.e., the Exon object)
|
|
76 will reflect the direction of the hit.
|
|
77
|
|
78 See the documentation of parse_next_alignment() for abilities of the parser
|
|
79 to deal with the different output format options of sim4.
|
|
80
|
|
81 =head1 FEEDBACK
|
|
82
|
|
83 =head2 Mailing Lists
|
|
84
|
|
85 User feedback is an integral part of the evolution of this and other
|
|
86 Bioperl modules. Send your comments and suggestions preferably to one
|
|
87 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
88
|
|
89 bioperl-l@bioperl.org - General discussion
|
|
90 http://bio.perl.org/MailList.html - About the mailing lists
|
|
91
|
|
92 =head2 Reporting Bugs
|
|
93
|
|
94 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
95 the bugs and their resolution. Bug reports can be submitted via email
|
|
96 or the web:
|
|
97
|
|
98 bioperl-bugs@bio.perl.org
|
|
99 http://bugzilla.bioperl.org/
|
|
100
|
|
101 =head1 AUTHOR - Ewan Birney, Hilmar Lapp
|
|
102
|
|
103 Email birney@sanger.ac.uk
|
|
104 hlapp@gmx.net (or hilmar.lapp@pharma.novartis.com)
|
|
105
|
|
106 Describe contact details here
|
|
107
|
|
108 =head1 APPENDIX
|
|
109
|
|
110 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
|
|
111
|
|
112 =cut
|
|
113
|
|
114
|
|
115 # Let the code begin...
|
|
116
|
|
117
|
|
118 package Bio::Tools::Sim4::Results;
|
|
119 use vars qw(@ISA);
|
|
120 use strict;
|
|
121
|
|
122 # Object preamble - inherits from Bio::Root::Object
|
|
123
|
|
124 use File::Basename;
|
|
125 use Bio::Root::Root;
|
|
126 use Bio::Tools::AnalysisResult;
|
|
127 use Bio::Tools::Sim4::Exon;
|
|
128
|
|
129 @ISA = qw(Bio::Tools::AnalysisResult);
|
|
130
|
|
131
|
|
132 sub _initialize_state {
|
|
133 my($self,@args) = @_;
|
|
134
|
|
135 # call the inherited method first
|
|
136 my $make = $self->SUPER::_initialize_state(@args);
|
|
137
|
|
138 my ($est_is_first) = $self->_rearrange([qw(ESTFIRST)], @args);
|
|
139
|
|
140 delete($self->{'_est_is_first'});
|
|
141 $self->{'_est_is_first'} = $est_is_first if(defined($est_is_first));
|
|
142 $self->analysis_method("Sim4");
|
|
143 }
|
|
144
|
|
145 =head2 analysis_method
|
|
146
|
|
147 Usage : $sim4->analysis_method();
|
|
148 Purpose : Inherited method. Overridden to ensure that the name matches
|
|
149 /sim4/i.
|
|
150 Returns : String
|
|
151 Argument : n/a
|
|
152
|
|
153 =cut
|
|
154
|
|
155 #-------------
|
|
156 sub analysis_method {
|
|
157 #-------------
|
|
158 my ($self, $method) = @_;
|
|
159 if($method && ($method !~ /sim4/i)) {
|
|
160 $self->throw("method $method not supported in " . ref($self));
|
|
161 }
|
|
162 return $self->SUPER::analysis_method($method);
|
|
163 }
|
|
164
|
|
165 =head2 parse_next_alignment
|
|
166
|
|
167 Title : parse_next_alignment
|
|
168 Usage : @exons = $sim4_result->parse_next_alignment;
|
|
169 foreach $exon (@exons) {
|
|
170 # do something
|
|
171 }
|
|
172 Function: Parses the next alignment of the Sim4 result file and returns the
|
|
173 found exons as an array of Bio::Tools::Sim4::Exon objects. Call
|
|
174 this method repeatedly until an empty array is returned to get the
|
|
175 results for all alignments.
|
|
176
|
|
177 The $exon->seq_id() attribute will be set to the identifier of the
|
|
178 respective sequence for both sequences if A=4 was used in the sim4
|
|
179 run, and otherwise for the second sequence only. If the output does
|
|
180 not contain the identifier, the filename stripped of path and
|
|
181 extension is used instead. In addition, the full filename
|
|
182 will be recorded for both features ($exon inherits off
|
|
183 Bio::SeqFeature::SimilarityPair) as tag 'filename'. The length
|
|
184 is accessible via the seqlength() attribute of $exon->query() and
|
|
185 $exon->est_hit().
|
|
186
|
|
187 Note that this method is capable of dealing with outputs generated
|
|
188 with format 0,1,3, and 4 (via the A=n option to sim4). It
|
|
189 automatically determines which of the two sequences has been
|
|
190 reversed, and adjusts the coordinates for that sequence. It will
|
|
191 also detect whether the EST sequence(s) were given as first or as
|
|
192 second file to sim4, unless this has been specified at creation
|
|
193 time of the object.
|
|
194
|
|
195 Example :
|
|
196 Returns : An array of Bio::Tools::Sim4::Exon objects
|
|
197 Args :
|
|
198
|
|
199
|
|
200 =cut
|
|
201
|
|
202 sub parse_next_alignment {
|
|
203 my ($self) = @_;
|
|
204 my @exons = ();
|
|
205 my %seq1props = ();
|
|
206 my %seq2props = ();
|
|
207 # we refer to the properties of each seq by reference
|
|
208 my ($estseq, $genomseq, $to_reverse);
|
|
209 my $started = 0;
|
|
210 my $hit_direction = 1;
|
|
211 my $output_fmt = 3; # same as 0 and 1 (we cannot deal with A=2 produced
|
|
212 # output yet)
|
|
213
|
|
214 while(defined($_ = $self->_readline())) {
|
|
215 #chomp();
|
|
216
|
|
217 #
|
|
218 # bascially, each sim4 'hit' starts with seq1...
|
|
219 #
|
|
220 /^seq1/ && do {
|
|
221 if($started) {
|
|
222 $self->_pushback($_);
|
|
223 last;
|
|
224 }
|
|
225 $started = 1;
|
|
226
|
|
227 # filename and length of seq 1
|
|
228 /^seq1\s+=\s+(\S+)\,\s+(\d+)/ ||
|
|
229 $self->throw("Sim4 parsing error on seq1 [$_] line. Sorry!");
|
|
230 $seq1props{'filename'} = $1;
|
|
231 $seq1props{'length'} = $2;
|
|
232 next;
|
|
233 };
|
|
234 /^seq2/ && do {
|
|
235 # the second hit has also the database name in the >name syntax
|
|
236 # (in brackets).
|
|
237 /^seq2\s+=\s+(\S+)\s+\(>?(\S+)\s*\)\,\s+(\d+)/||
|
|
238 $self->throw("Sim4 parsing error on seq2 [$_] line. Sorry!");
|
|
239 $seq2props{'filename'} = $1;
|
|
240 $seq2props{'seqname'} = $2;
|
|
241 $seq2props{'length'} = $3;
|
|
242 next;
|
|
243 };
|
|
244 if(/^>(\S+)\s*(.*)$/) {
|
|
245 # output option was A=4, which not only gives the complete
|
|
246 # description lines, but also causes the longer sequence to be
|
|
247 # reversed if the second file contained one (genomic) sequence
|
|
248 $seq1props{'seqname'} = $1;
|
|
249 $seq1props{'description'} = $2 if $2;
|
|
250 $output_fmt = 4;
|
|
251 # we handle seq1 and seq2 both here
|
|
252 if(defined($_ = $self->_readline()) && (/^>(\S+)\s*(.*)$/)) {
|
|
253 $seq2props{'seqname'} = $1; # redundant, since already set above
|
|
254 $seq2props{'description'} = $2 if $2;
|
|
255 }
|
|
256 next;
|
|
257 }
|
|
258 /^\(complement\)/ && do {
|
|
259 $hit_direction = -1;
|
|
260 next;
|
|
261 };
|
|
262 # this matches
|
|
263 # start-end (start-end) pctid%
|
|
264 if(/(\d+)-(\d+)\s+\((\d+)-(\d+)\)\s+(\d+)%/) {
|
|
265 $seq1props{'start'} = $1;
|
|
266 $seq1props{'end'} = $2;
|
|
267 $seq2props{'start'} = $3;
|
|
268 $seq2props{'end'} = $4;
|
|
269 my $pctid = $5;
|
|
270
|
|
271 if(! defined($estseq)) {
|
|
272 # for the first time here: need to set the references referring
|
|
273 # to seq1 and seq2
|
|
274 if(! exists($self->{'_est_is_first'})) {
|
|
275 # detect which one is the EST by looking at the lengths,
|
|
276 # and assume that this holds throughout the entire result
|
|
277 # file (i.e., when this method is called for the next
|
|
278 # alignment, this will not be checked again)
|
|
279 if($seq1props{'length'} > $seq2props{'length'}) {
|
|
280 $self->{'_est_is_first'} = 0;
|
|
281 } else {
|
|
282 $self->{'_est_is_first'} = 1;
|
|
283 }
|
|
284 }
|
|
285 if($self->{'_est_is_first'}) {
|
|
286 $estseq = \%seq1props;
|
|
287 $genomseq = \%seq2props;
|
|
288 # if the EST is given first, A=4 selects the genomic
|
|
289 # seq for being reversed (reversing the EST is default)
|
|
290 $to_reverse = ($output_fmt == 4) ? $genomseq : $estseq;
|
|
291 } else {
|
|
292 $estseq = \%seq2props;
|
|
293 $genomseq = \%seq1props;
|
|
294 # if the EST is the second, A=4 does not change the
|
|
295 # seq being reversed (always the EST is reversed)
|
|
296 $to_reverse = $estseq;
|
|
297 }
|
|
298 }
|
|
299 if($hit_direction == -1) {
|
|
300 # we have to reverse the coordinates of one of both seqs
|
|
301 my $tmp = $to_reverse->{'start'};
|
|
302 $to_reverse->{'start'} =
|
|
303 $to_reverse->{'length'} - $to_reverse->{'end'} + 1;
|
|
304 $to_reverse->{'end'} = $to_reverse->{'length'} - $tmp + 1;
|
|
305 }
|
|
306 # create and initialize the exon object
|
|
307 my $exon = Bio::Tools::Sim4::Exon->new(
|
|
308 '-start' => $genomseq->{'start'},
|
|
309 '-end' => $genomseq->{'end'},
|
|
310 '-strand' => $hit_direction);
|
|
311 if(exists($genomseq->{'seqname'})) {
|
|
312 $exon->seq_id($genomseq->{'seqname'});
|
|
313 } else {
|
|
314 # take filename stripped of path as fall back
|
|
315 my ($basename) = &File::Basename::fileparse($genomseq->{'filename'}, '\..*');
|
|
316 $exon->seq_id($basename);
|
|
317 }
|
|
318 $exon->feature1()->add_tag_value('filename',
|
|
319 $genomseq->{'filename'});
|
|
320 # feature1 is supposed to be initialized to a Similarity object,
|
|
321 # but we provide a safety net
|
|
322 if($exon->feature1()->can('seqlength')) {
|
|
323 $exon->feature1()->seqlength($genomseq->{'length'});
|
|
324 } else {
|
|
325 $exon->feature1()->add_tag_value('SeqLength',
|
|
326 $genomseq->{'length'});
|
|
327 }
|
|
328 # create and initialize the feature wrapping the 'hit' (the EST)
|
|
329 my $fea2 = Bio::SeqFeature::Similarity->new(
|
|
330 '-start' => $estseq->{'start'},
|
|
331 '-end' => $estseq->{'end'},
|
|
332 '-strand' => 0,
|
|
333 '-primary' => "aligning_EST");
|
|
334 if(exists($estseq->{'seqname'})) {
|
|
335 $fea2->seq_id($estseq->{'seqname'});
|
|
336 } else {
|
|
337 # take filename stripped of path as fall back
|
|
338 my ($basename) =
|
|
339 &File::Basename::fileparse($estseq->{'filename'}, '\..*');
|
|
340 $fea2->seq_id($basename);
|
|
341 }
|
|
342 $fea2->add_tag_value('filename', $estseq->{'filename'});
|
|
343 $fea2->seqlength($estseq->{'length'});
|
|
344 # store
|
|
345 $exon->est_hit($fea2);
|
|
346 # general properties
|
|
347 $exon->source_tag($self->analysis_method());
|
|
348 $exon->percentage_id($pctid);
|
|
349 $exon->score($exon->percentage_id());
|
|
350 # push onto array
|
|
351 push(@exons, $exon);
|
|
352 next; # back to while loop
|
|
353 }
|
|
354 }
|
|
355 return @exons;
|
|
356 }
|
|
357
|
|
358 =head2 next_exonset
|
|
359
|
|
360 Title : next_exonset
|
|
361 Usage : $exonset = $sim4_result->parse_next_exonset;
|
|
362 print "Exons start at ", $exonset->start(),
|
|
363 "and end at ", $exonset->end(), "\n";
|
|
364 foreach $exon ($exonset->sub_SeqFeature()) {
|
|
365 # do something
|
|
366 }
|
|
367 Function: Parses the next alignment of the Sim4 result file and returns the
|
|
368 set of exons as a container of features. The container is itself
|
|
369 a Bio::SeqFeature::Generic object, with the Bio::Tools::Sim4::Exon
|
|
370 objects as sub features. Start, end, and strand of the container
|
|
371 will represent the total region covered by the exons of this set.
|
|
372
|
|
373 See the documentation of parse_next_alignment() for further
|
|
374 reference about parsing and how the information is stored.
|
|
375
|
|
376 Example :
|
|
377 Returns : An Bio::SeqFeature::Generic object holding Bio::Tools::Sim4::Exon
|
|
378 objects as sub features.
|
|
379 Args :
|
|
380
|
|
381 =cut
|
|
382
|
|
383 sub next_exonset {
|
|
384 my $self = shift;
|
|
385 my $exonset;
|
|
386
|
|
387 # get the next array of exons
|
|
388 my @exons = $self->parse_next_alignment();
|
|
389 return if($#exons < 0);
|
|
390 # create the container of exons as a feature object itself, with the
|
|
391 # data of the first exon for initialization
|
|
392 $exonset = Bio::SeqFeature::Generic->new('-start' => $exons[0]->start(),
|
|
393 '-end' => $exons[0]->end(),
|
|
394 '-strand' => $exons[0]->strand(),
|
|
395 '-primary' => "ExonSet");
|
|
396 $exonset->source_tag($exons[0]->source_tag());
|
|
397 $exonset->seq_id($exons[0]->seq_id());
|
|
398 # now add all exons as sub features, with enabling EXPANsion of the region
|
|
399 # covered in total
|
|
400 foreach my $exon (@exons) {
|
|
401 $exonset->add_sub_SeqFeature($exon, 'EXPAND');
|
|
402 }
|
|
403 return $exonset;
|
|
404 }
|
|
405
|
|
406 =head2 next_feature
|
|
407
|
|
408 Title : next_feature
|
|
409 Usage : while($exonset = $sim4->next_feature()) {
|
|
410 # do something
|
|
411 }
|
|
412 Function: Does the same as L<next_exonset()>. See there for documentation of
|
|
413 the functionality. Call this method repeatedly until FALSE is
|
|
414 returned.
|
|
415
|
|
416 The returned object is actually a SeqFeatureI implementing object.
|
|
417 This method is required for classes implementing the
|
|
418 SeqAnalysisParserI interface, and is merely an alias for
|
|
419 next_exonset() at present.
|
|
420
|
|
421 Example :
|
|
422 Returns : A Bio::SeqFeature::Generic object.
|
|
423 Args :
|
|
424
|
|
425 =cut
|
|
426
|
|
427 sub next_feature {
|
|
428 my ($self,@args) = @_;
|
|
429 # even though next_exonset doesn't expect any args (and this method
|
|
430 # does neither), we pass on args in order to be prepared if this changes
|
|
431 # ever
|
|
432 return $self->next_exonset(@args);
|
|
433 }
|
|
434
|
|
435 1;
|