annotate variant_effect_predictor/Bio/Tools/Sim4/Results.pm @ 0:21066c0abaf5 draft

Uploaded
author willmclaren
date Fri, 03 Aug 2012 10:04:48 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
2 #
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
3 # BioPerl module for Bio::Tools::Sim4::Results
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
4 #
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
5 # Cared for by Ewan Birney <birney@sanger.ac.uk>
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
6 # and Hilmar Lapp <hlapp@gmx.net>
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
7 #
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
8 # Copyright Ewan Birney and Hilmar Lapp
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
9 #
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
10 # You may distribute this module under the same terms as perl itself
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
11
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
12 # POD documentation - main docs before the code
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
13
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
14 =head1 NAME
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
15
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
16 Bio::Tools::Sim4::Results - Results of one Sim4 run
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
17
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
18 =head1 SYNOPSIS
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
19
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
20 # to preset the order of EST and genomic file as given on the sim4
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
21 # command line:
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
22 my $sim4 = Bio::Tools::Sim4::Results->new(-file => 'result.sim4',
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
23 -estfirst => 1);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
24 # to let the order be determined automatically (by length comparison):
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
25 $sim4 = Bio::Tools::Sim4::Results->new( -file => 'sim4.results' );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
26 # filehandle:
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
27 $sim4 = Bio::Tools::Sim4::Results->new( -fh => \*INPUT );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
28
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
29 # parse the results
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
30 while(my $exonset = $sim4->next_exonset()) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
31 # $exonset is-a Bio::SeqFeature::Generic with Bio::Tools::Sim4::Exons
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
32 # as sub features
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
33 print "Delimited on sequence ", $exonset->seq_id(),
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
34 "from ", $exonset->start(), " to ", $exonset->end(), "\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
35 foreach my $exon ( $exonset->sub_SeqFeature() ) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
36 # $exon is-a Bio::SeqFeature::FeaturePair
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
37 print "Exon from ", $exon->start, " to ", $exon->end,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
38 " on strand ", $exon->strand(), "\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
39 # you can get out what it matched using the est_hit attribute
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
40 my $homol = $exon->est_hit();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
41 print "Matched to sequence ", $homol->seq_id,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
42 " at ", $homol->start," to ", $homol->end, "\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
43 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
44 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
45
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
46 # essential if you gave a filename at initialization (otherwise the file
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
47 # stays open)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
48 $sim4->close();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
49
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
50 =head1 DESCRIPTION
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
51
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
52 The sim4 module provides a parser and results object for sim4 output. The
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
53 sim4 results are specialised types of SeqFeatures, meaning you can add them
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
54 to AnnSeq objects fine, and manipulate them in the "normal" seqfeature manner.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
55
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
56 The sim4 Exon objects are Bio::SeqFeature::FeaturePair inherited objects. The
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
57 $esthit = $exon-E<gt>est_hit() is the alignment as a feature on the matching
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
58 object (normally, an EST), in which the start/end points are where the hit
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
59 lies.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
60
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
61 To make this module work sensibly you need to run
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
62
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
63 sim4 genomic.fasta est.database.fasta
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
64 or
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
65 sim4 est.fasta genomic.database.fasta
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
66
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
67 To get the sequence identifiers recorded for the first sequence, too, use
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
68 A=4 as output option for sim4.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
69
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
70 One fiddle here is that there are only two real possibilities to the matching
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
71 criteria: either one sequence needs reversing or not. Because of this, it
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
72 is impossible to tell whether the match is in the forward or reverse strand
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
73 of the genomic DNA. We solve this here by assuming that the genomic DNA is
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
74 always forward. As a consequence, the strand attribute of the matching EST is
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
75 unknown, and the strand attribute of the genomic DNA (i.e., the Exon object)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
76 will reflect the direction of the hit.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
77
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
78 See the documentation of parse_next_alignment() for abilities of the parser
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
79 to deal with the different output format options of sim4.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
80
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
81 =head1 FEEDBACK
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
82
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
83 =head2 Mailing Lists
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
84
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
85 User feedback is an integral part of the evolution of this and other
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
86 Bioperl modules. Send your comments and suggestions preferably to one
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
87 of the Bioperl mailing lists. Your participation is much appreciated.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
88
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
89 bioperl-l@bioperl.org - General discussion
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
90 http://bio.perl.org/MailList.html - About the mailing lists
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
91
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
92 =head2 Reporting Bugs
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
93
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
94 Report bugs to the Bioperl bug tracking system to help us keep track
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
95 the bugs and their resolution. Bug reports can be submitted via email
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
96 or the web:
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
97
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
98 bioperl-bugs@bio.perl.org
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
99 http://bugzilla.bioperl.org/
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
100
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
101 =head1 AUTHOR - Ewan Birney, Hilmar Lapp
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
102
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
103 Email birney@sanger.ac.uk
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
104 hlapp@gmx.net (or hilmar.lapp@pharma.novartis.com)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
105
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
106 Describe contact details here
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
107
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
108 =head1 APPENDIX
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
109
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
110 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
111
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
112 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
113
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
114
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
115 # Let the code begin...
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
116
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
117
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
118 package Bio::Tools::Sim4::Results;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
119 use vars qw(@ISA);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
120 use strict;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
121
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
122 # Object preamble - inherits from Bio::Root::Object
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
123
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
124 use File::Basename;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
125 use Bio::Root::Root;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
126 use Bio::Tools::AnalysisResult;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
127 use Bio::Tools::Sim4::Exon;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
128
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
129 @ISA = qw(Bio::Tools::AnalysisResult);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
130
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
131
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
132 sub _initialize_state {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
133 my($self,@args) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
134
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
135 # call the inherited method first
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
136 my $make = $self->SUPER::_initialize_state(@args);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
137
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
138 my ($est_is_first) = $self->_rearrange([qw(ESTFIRST)], @args);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
139
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
140 delete($self->{'_est_is_first'});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
141 $self->{'_est_is_first'} = $est_is_first if(defined($est_is_first));
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
142 $self->analysis_method("Sim4");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
143 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
144
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
145 =head2 analysis_method
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
146
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
147 Usage : $sim4->analysis_method();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
148 Purpose : Inherited method. Overridden to ensure that the name matches
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
149 /sim4/i.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
150 Returns : String
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
151 Argument : n/a
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
152
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
153 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
154
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
155 #-------------
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
156 sub analysis_method {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
157 #-------------
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
158 my ($self, $method) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
159 if($method && ($method !~ /sim4/i)) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
160 $self->throw("method $method not supported in " . ref($self));
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
161 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
162 return $self->SUPER::analysis_method($method);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
163 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
164
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
165 =head2 parse_next_alignment
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
166
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
167 Title : parse_next_alignment
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
168 Usage : @exons = $sim4_result->parse_next_alignment;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
169 foreach $exon (@exons) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
170 # do something
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
171 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
172 Function: Parses the next alignment of the Sim4 result file and returns the
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
173 found exons as an array of Bio::Tools::Sim4::Exon objects. Call
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
174 this method repeatedly until an empty array is returned to get the
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
175 results for all alignments.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
176
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
177 The $exon->seq_id() attribute will be set to the identifier of the
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
178 respective sequence for both sequences if A=4 was used in the sim4
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
179 run, and otherwise for the second sequence only. If the output does
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
180 not contain the identifier, the filename stripped of path and
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
181 extension is used instead. In addition, the full filename
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
182 will be recorded for both features ($exon inherits off
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
183 Bio::SeqFeature::SimilarityPair) as tag 'filename'. The length
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
184 is accessible via the seqlength() attribute of $exon->query() and
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
185 $exon->est_hit().
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
186
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
187 Note that this method is capable of dealing with outputs generated
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
188 with format 0,1,3, and 4 (via the A=n option to sim4). It
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
189 automatically determines which of the two sequences has been
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
190 reversed, and adjusts the coordinates for that sequence. It will
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
191 also detect whether the EST sequence(s) were given as first or as
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
192 second file to sim4, unless this has been specified at creation
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
193 time of the object.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
194
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
195 Example :
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
196 Returns : An array of Bio::Tools::Sim4::Exon objects
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
197 Args :
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
198
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
199
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
200 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
201
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
202 sub parse_next_alignment {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
203 my ($self) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
204 my @exons = ();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
205 my %seq1props = ();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
206 my %seq2props = ();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
207 # we refer to the properties of each seq by reference
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
208 my ($estseq, $genomseq, $to_reverse);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
209 my $started = 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
210 my $hit_direction = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
211 my $output_fmt = 3; # same as 0 and 1 (we cannot deal with A=2 produced
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
212 # output yet)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
213
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
214 while(defined($_ = $self->_readline())) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
215 #chomp();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
216
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
217 #
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
218 # bascially, each sim4 'hit' starts with seq1...
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
219 #
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
220 /^seq1/ && do {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
221 if($started) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
222 $self->_pushback($_);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
223 last;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
224 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
225 $started = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
226
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
227 # filename and length of seq 1
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
228 /^seq1\s+=\s+(\S+)\,\s+(\d+)/ ||
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
229 $self->throw("Sim4 parsing error on seq1 [$_] line. Sorry!");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
230 $seq1props{'filename'} = $1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
231 $seq1props{'length'} = $2;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
232 next;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
233 };
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
234 /^seq2/ && do {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
235 # the second hit has also the database name in the >name syntax
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
236 # (in brackets).
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
237 /^seq2\s+=\s+(\S+)\s+\(>?(\S+)\s*\)\,\s+(\d+)/||
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
238 $self->throw("Sim4 parsing error on seq2 [$_] line. Sorry!");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
239 $seq2props{'filename'} = $1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
240 $seq2props{'seqname'} = $2;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
241 $seq2props{'length'} = $3;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
242 next;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
243 };
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
244 if(/^>(\S+)\s*(.*)$/) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
245 # output option was A=4, which not only gives the complete
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
246 # description lines, but also causes the longer sequence to be
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
247 # reversed if the second file contained one (genomic) sequence
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
248 $seq1props{'seqname'} = $1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
249 $seq1props{'description'} = $2 if $2;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
250 $output_fmt = 4;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
251 # we handle seq1 and seq2 both here
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
252 if(defined($_ = $self->_readline()) && (/^>(\S+)\s*(.*)$/)) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
253 $seq2props{'seqname'} = $1; # redundant, since already set above
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
254 $seq2props{'description'} = $2 if $2;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
255 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
256 next;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
257 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
258 /^\(complement\)/ && do {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
259 $hit_direction = -1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
260 next;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
261 };
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
262 # this matches
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
263 # start-end (start-end) pctid%
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
264 if(/(\d+)-(\d+)\s+\((\d+)-(\d+)\)\s+(\d+)%/) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
265 $seq1props{'start'} = $1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
266 $seq1props{'end'} = $2;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
267 $seq2props{'start'} = $3;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
268 $seq2props{'end'} = $4;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
269 my $pctid = $5;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
270
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
271 if(! defined($estseq)) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
272 # for the first time here: need to set the references referring
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
273 # to seq1 and seq2
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
274 if(! exists($self->{'_est_is_first'})) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
275 # detect which one is the EST by looking at the lengths,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
276 # and assume that this holds throughout the entire result
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
277 # file (i.e., when this method is called for the next
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
278 # alignment, this will not be checked again)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
279 if($seq1props{'length'} > $seq2props{'length'}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
280 $self->{'_est_is_first'} = 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
281 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
282 $self->{'_est_is_first'} = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
283 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
284 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
285 if($self->{'_est_is_first'}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
286 $estseq = \%seq1props;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
287 $genomseq = \%seq2props;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
288 # if the EST is given first, A=4 selects the genomic
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
289 # seq for being reversed (reversing the EST is default)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
290 $to_reverse = ($output_fmt == 4) ? $genomseq : $estseq;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
291 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
292 $estseq = \%seq2props;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
293 $genomseq = \%seq1props;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
294 # if the EST is the second, A=4 does not change the
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
295 # seq being reversed (always the EST is reversed)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
296 $to_reverse = $estseq;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
297 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
298 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
299 if($hit_direction == -1) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
300 # we have to reverse the coordinates of one of both seqs
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
301 my $tmp = $to_reverse->{'start'};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
302 $to_reverse->{'start'} =
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
303 $to_reverse->{'length'} - $to_reverse->{'end'} + 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
304 $to_reverse->{'end'} = $to_reverse->{'length'} - $tmp + 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
305 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
306 # create and initialize the exon object
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
307 my $exon = Bio::Tools::Sim4::Exon->new(
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
308 '-start' => $genomseq->{'start'},
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
309 '-end' => $genomseq->{'end'},
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
310 '-strand' => $hit_direction);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
311 if(exists($genomseq->{'seqname'})) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
312 $exon->seq_id($genomseq->{'seqname'});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
313 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
314 # take filename stripped of path as fall back
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
315 my ($basename) = &File::Basename::fileparse($genomseq->{'filename'}, '\..*');
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
316 $exon->seq_id($basename);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
317 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
318 $exon->feature1()->add_tag_value('filename',
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
319 $genomseq->{'filename'});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
320 # feature1 is supposed to be initialized to a Similarity object,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
321 # but we provide a safety net
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
322 if($exon->feature1()->can('seqlength')) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
323 $exon->feature1()->seqlength($genomseq->{'length'});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
324 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
325 $exon->feature1()->add_tag_value('SeqLength',
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
326 $genomseq->{'length'});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
327 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
328 # create and initialize the feature wrapping the 'hit' (the EST)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
329 my $fea2 = Bio::SeqFeature::Similarity->new(
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
330 '-start' => $estseq->{'start'},
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
331 '-end' => $estseq->{'end'},
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
332 '-strand' => 0,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
333 '-primary' => "aligning_EST");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
334 if(exists($estseq->{'seqname'})) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
335 $fea2->seq_id($estseq->{'seqname'});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
336 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
337 # take filename stripped of path as fall back
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
338 my ($basename) =
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
339 &File::Basename::fileparse($estseq->{'filename'}, '\..*');
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
340 $fea2->seq_id($basename);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
341 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
342 $fea2->add_tag_value('filename', $estseq->{'filename'});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
343 $fea2->seqlength($estseq->{'length'});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
344 # store
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
345 $exon->est_hit($fea2);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
346 # general properties
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
347 $exon->source_tag($self->analysis_method());
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
348 $exon->percentage_id($pctid);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
349 $exon->score($exon->percentage_id());
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
350 # push onto array
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
351 push(@exons, $exon);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
352 next; # back to while loop
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
353 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
354 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
355 return @exons;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
356 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
357
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
358 =head2 next_exonset
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
359
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
360 Title : next_exonset
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
361 Usage : $exonset = $sim4_result->parse_next_exonset;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
362 print "Exons start at ", $exonset->start(),
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
363 "and end at ", $exonset->end(), "\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
364 foreach $exon ($exonset->sub_SeqFeature()) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
365 # do something
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
366 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
367 Function: Parses the next alignment of the Sim4 result file and returns the
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
368 set of exons as a container of features. The container is itself
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
369 a Bio::SeqFeature::Generic object, with the Bio::Tools::Sim4::Exon
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
370 objects as sub features. Start, end, and strand of the container
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
371 will represent the total region covered by the exons of this set.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
372
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
373 See the documentation of parse_next_alignment() for further
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
374 reference about parsing and how the information is stored.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
375
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
376 Example :
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
377 Returns : An Bio::SeqFeature::Generic object holding Bio::Tools::Sim4::Exon
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
378 objects as sub features.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
379 Args :
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
380
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
381 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
382
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
383 sub next_exonset {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
384 my $self = shift;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
385 my $exonset;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
386
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
387 # get the next array of exons
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
388 my @exons = $self->parse_next_alignment();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
389 return if($#exons < 0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
390 # create the container of exons as a feature object itself, with the
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
391 # data of the first exon for initialization
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
392 $exonset = Bio::SeqFeature::Generic->new('-start' => $exons[0]->start(),
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
393 '-end' => $exons[0]->end(),
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
394 '-strand' => $exons[0]->strand(),
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
395 '-primary' => "ExonSet");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
396 $exonset->source_tag($exons[0]->source_tag());
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
397 $exonset->seq_id($exons[0]->seq_id());
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
398 # now add all exons as sub features, with enabling EXPANsion of the region
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
399 # covered in total
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
400 foreach my $exon (@exons) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
401 $exonset->add_sub_SeqFeature($exon, 'EXPAND');
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
402 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
403 return $exonset;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
404 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
405
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
406 =head2 next_feature
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
407
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
408 Title : next_feature
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
409 Usage : while($exonset = $sim4->next_feature()) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
410 # do something
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
411 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
412 Function: Does the same as L<next_exonset()>. See there for documentation of
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
413 the functionality. Call this method repeatedly until FALSE is
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
414 returned.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
415
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
416 The returned object is actually a SeqFeatureI implementing object.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
417 This method is required for classes implementing the
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
418 SeqAnalysisParserI interface, and is merely an alias for
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
419 next_exonset() at present.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
420
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
421 Example :
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
422 Returns : A Bio::SeqFeature::Generic object.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
423 Args :
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
424
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
425 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
426
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
427 sub next_feature {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
428 my ($self,@args) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
429 # even though next_exonset doesn't expect any args (and this method
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
430 # does neither), we pass on args in order to be prepared if this changes
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
431 # ever
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
432 return $self->next_exonset(@args);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
433 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
434
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
435 1;