Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Tools/Sim4/Results.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 | |
| 2 # | |
| 3 # BioPerl module for Bio::Tools::Sim4::Results | |
| 4 # | |
| 5 # Cared for by Ewan Birney <birney@sanger.ac.uk> | |
| 6 # and Hilmar Lapp <hlapp@gmx.net> | |
| 7 # | |
| 8 # Copyright Ewan Birney and Hilmar Lapp | |
| 9 # | |
| 10 # You may distribute this module under the same terms as perl itself | |
| 11 | |
| 12 # POD documentation - main docs before the code | |
| 13 | |
| 14 =head1 NAME | |
| 15 | |
| 16 Bio::Tools::Sim4::Results - Results of one Sim4 run | |
| 17 | |
| 18 =head1 SYNOPSIS | |
| 19 | |
| 20 # to preset the order of EST and genomic file as given on the sim4 | |
| 21 # command line: | |
| 22 my $sim4 = Bio::Tools::Sim4::Results->new(-file => 'result.sim4', | |
| 23 -estfirst => 1); | |
| 24 # to let the order be determined automatically (by length comparison): | |
| 25 $sim4 = Bio::Tools::Sim4::Results->new( -file => 'sim4.results' ); | |
| 26 # filehandle: | |
| 27 $sim4 = Bio::Tools::Sim4::Results->new( -fh => \*INPUT ); | |
| 28 | |
| 29 # parse the results | |
| 30 while(my $exonset = $sim4->next_exonset()) { | |
| 31 # $exonset is-a Bio::SeqFeature::Generic with Bio::Tools::Sim4::Exons | |
| 32 # as sub features | |
| 33 print "Delimited on sequence ", $exonset->seq_id(), | |
| 34 "from ", $exonset->start(), " to ", $exonset->end(), "\n"; | |
| 35 foreach my $exon ( $exonset->sub_SeqFeature() ) { | |
| 36 # $exon is-a Bio::SeqFeature::FeaturePair | |
| 37 print "Exon from ", $exon->start, " to ", $exon->end, | |
| 38 " on strand ", $exon->strand(), "\n"; | |
| 39 # you can get out what it matched using the est_hit attribute | |
| 40 my $homol = $exon->est_hit(); | |
| 41 print "Matched to sequence ", $homol->seq_id, | |
| 42 " at ", $homol->start," to ", $homol->end, "\n"; | |
| 43 } | |
| 44 } | |
| 45 | |
| 46 # essential if you gave a filename at initialization (otherwise the file | |
| 47 # stays open) | |
| 48 $sim4->close(); | |
| 49 | |
| 50 =head1 DESCRIPTION | |
| 51 | |
| 52 The sim4 module provides a parser and results object for sim4 output. The | |
| 53 sim4 results are specialised types of SeqFeatures, meaning you can add them | |
| 54 to AnnSeq objects fine, and manipulate them in the "normal" seqfeature manner. | |
| 55 | |
| 56 The sim4 Exon objects are Bio::SeqFeature::FeaturePair inherited objects. The | |
| 57 $esthit = $exon-E<gt>est_hit() is the alignment as a feature on the matching | |
| 58 object (normally, an EST), in which the start/end points are where the hit | |
| 59 lies. | |
| 60 | |
| 61 To make this module work sensibly you need to run | |
| 62 | |
| 63 sim4 genomic.fasta est.database.fasta | |
| 64 or | |
| 65 sim4 est.fasta genomic.database.fasta | |
| 66 | |
| 67 To get the sequence identifiers recorded for the first sequence, too, use | |
| 68 A=4 as output option for sim4. | |
| 69 | |
| 70 One fiddle here is that there are only two real possibilities to the matching | |
| 71 criteria: either one sequence needs reversing or not. Because of this, it | |
| 72 is impossible to tell whether the match is in the forward or reverse strand | |
| 73 of the genomic DNA. We solve this here by assuming that the genomic DNA is | |
| 74 always forward. As a consequence, the strand attribute of the matching EST is | |
| 75 unknown, and the strand attribute of the genomic DNA (i.e., the Exon object) | |
| 76 will reflect the direction of the hit. | |
| 77 | |
| 78 See the documentation of parse_next_alignment() for abilities of the parser | |
| 79 to deal with the different output format options of sim4. | |
| 80 | |
| 81 =head1 FEEDBACK | |
| 82 | |
| 83 =head2 Mailing Lists | |
| 84 | |
| 85 User feedback is an integral part of the evolution of this and other | |
| 86 Bioperl modules. Send your comments and suggestions preferably to one | |
| 87 of the Bioperl mailing lists. Your participation is much appreciated. | |
| 88 | |
| 89 bioperl-l@bioperl.org - General discussion | |
| 90 http://bio.perl.org/MailList.html - About the mailing lists | |
| 91 | |
| 92 =head2 Reporting Bugs | |
| 93 | |
| 94 Report bugs to the Bioperl bug tracking system to help us keep track | |
| 95 the bugs and their resolution. Bug reports can be submitted via email | |
| 96 or the web: | |
| 97 | |
| 98 bioperl-bugs@bio.perl.org | |
| 99 http://bugzilla.bioperl.org/ | |
| 100 | |
| 101 =head1 AUTHOR - Ewan Birney, Hilmar Lapp | |
| 102 | |
| 103 Email birney@sanger.ac.uk | |
| 104 hlapp@gmx.net (or hilmar.lapp@pharma.novartis.com) | |
| 105 | |
| 106 Describe contact details here | |
| 107 | |
| 108 =head1 APPENDIX | |
| 109 | |
| 110 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ | |
| 111 | |
| 112 =cut | |
| 113 | |
| 114 | |
| 115 # Let the code begin... | |
| 116 | |
| 117 | |
| 118 package Bio::Tools::Sim4::Results; | |
| 119 use vars qw(@ISA); | |
| 120 use strict; | |
| 121 | |
| 122 # Object preamble - inherits from Bio::Root::Object | |
| 123 | |
| 124 use File::Basename; | |
| 125 use Bio::Root::Root; | |
| 126 use Bio::Tools::AnalysisResult; | |
| 127 use Bio::Tools::Sim4::Exon; | |
| 128 | |
| 129 @ISA = qw(Bio::Tools::AnalysisResult); | |
| 130 | |
| 131 | |
| 132 sub _initialize_state { | |
| 133 my($self,@args) = @_; | |
| 134 | |
| 135 # call the inherited method first | |
| 136 my $make = $self->SUPER::_initialize_state(@args); | |
| 137 | |
| 138 my ($est_is_first) = $self->_rearrange([qw(ESTFIRST)], @args); | |
| 139 | |
| 140 delete($self->{'_est_is_first'}); | |
| 141 $self->{'_est_is_first'} = $est_is_first if(defined($est_is_first)); | |
| 142 $self->analysis_method("Sim4"); | |
| 143 } | |
| 144 | |
| 145 =head2 analysis_method | |
| 146 | |
| 147 Usage : $sim4->analysis_method(); | |
| 148 Purpose : Inherited method. Overridden to ensure that the name matches | |
| 149 /sim4/i. | |
| 150 Returns : String | |
| 151 Argument : n/a | |
| 152 | |
| 153 =cut | |
| 154 | |
| 155 #------------- | |
| 156 sub analysis_method { | |
| 157 #------------- | |
| 158 my ($self, $method) = @_; | |
| 159 if($method && ($method !~ /sim4/i)) { | |
| 160 $self->throw("method $method not supported in " . ref($self)); | |
| 161 } | |
| 162 return $self->SUPER::analysis_method($method); | |
| 163 } | |
| 164 | |
| 165 =head2 parse_next_alignment | |
| 166 | |
| 167 Title : parse_next_alignment | |
| 168 Usage : @exons = $sim4_result->parse_next_alignment; | |
| 169 foreach $exon (@exons) { | |
| 170 # do something | |
| 171 } | |
| 172 Function: Parses the next alignment of the Sim4 result file and returns the | |
| 173 found exons as an array of Bio::Tools::Sim4::Exon objects. Call | |
| 174 this method repeatedly until an empty array is returned to get the | |
| 175 results for all alignments. | |
| 176 | |
| 177 The $exon->seq_id() attribute will be set to the identifier of the | |
| 178 respective sequence for both sequences if A=4 was used in the sim4 | |
| 179 run, and otherwise for the second sequence only. If the output does | |
| 180 not contain the identifier, the filename stripped of path and | |
| 181 extension is used instead. In addition, the full filename | |
| 182 will be recorded for both features ($exon inherits off | |
| 183 Bio::SeqFeature::SimilarityPair) as tag 'filename'. The length | |
| 184 is accessible via the seqlength() attribute of $exon->query() and | |
| 185 $exon->est_hit(). | |
| 186 | |
| 187 Note that this method is capable of dealing with outputs generated | |
| 188 with format 0,1,3, and 4 (via the A=n option to sim4). It | |
| 189 automatically determines which of the two sequences has been | |
| 190 reversed, and adjusts the coordinates for that sequence. It will | |
| 191 also detect whether the EST sequence(s) were given as first or as | |
| 192 second file to sim4, unless this has been specified at creation | |
| 193 time of the object. | |
| 194 | |
| 195 Example : | |
| 196 Returns : An array of Bio::Tools::Sim4::Exon objects | |
| 197 Args : | |
| 198 | |
| 199 | |
| 200 =cut | |
| 201 | |
| 202 sub parse_next_alignment { | |
| 203 my ($self) = @_; | |
| 204 my @exons = (); | |
| 205 my %seq1props = (); | |
| 206 my %seq2props = (); | |
| 207 # we refer to the properties of each seq by reference | |
| 208 my ($estseq, $genomseq, $to_reverse); | |
| 209 my $started = 0; | |
| 210 my $hit_direction = 1; | |
| 211 my $output_fmt = 3; # same as 0 and 1 (we cannot deal with A=2 produced | |
| 212 # output yet) | |
| 213 | |
| 214 while(defined($_ = $self->_readline())) { | |
| 215 #chomp(); | |
| 216 | |
| 217 # | |
| 218 # bascially, each sim4 'hit' starts with seq1... | |
| 219 # | |
| 220 /^seq1/ && do { | |
| 221 if($started) { | |
| 222 $self->_pushback($_); | |
| 223 last; | |
| 224 } | |
| 225 $started = 1; | |
| 226 | |
| 227 # filename and length of seq 1 | |
| 228 /^seq1\s+=\s+(\S+)\,\s+(\d+)/ || | |
| 229 $self->throw("Sim4 parsing error on seq1 [$_] line. Sorry!"); | |
| 230 $seq1props{'filename'} = $1; | |
| 231 $seq1props{'length'} = $2; | |
| 232 next; | |
| 233 }; | |
| 234 /^seq2/ && do { | |
| 235 # the second hit has also the database name in the >name syntax | |
| 236 # (in brackets). | |
| 237 /^seq2\s+=\s+(\S+)\s+\(>?(\S+)\s*\)\,\s+(\d+)/|| | |
| 238 $self->throw("Sim4 parsing error on seq2 [$_] line. Sorry!"); | |
| 239 $seq2props{'filename'} = $1; | |
| 240 $seq2props{'seqname'} = $2; | |
| 241 $seq2props{'length'} = $3; | |
| 242 next; | |
| 243 }; | |
| 244 if(/^>(\S+)\s*(.*)$/) { | |
| 245 # output option was A=4, which not only gives the complete | |
| 246 # description lines, but also causes the longer sequence to be | |
| 247 # reversed if the second file contained one (genomic) sequence | |
| 248 $seq1props{'seqname'} = $1; | |
| 249 $seq1props{'description'} = $2 if $2; | |
| 250 $output_fmt = 4; | |
| 251 # we handle seq1 and seq2 both here | |
| 252 if(defined($_ = $self->_readline()) && (/^>(\S+)\s*(.*)$/)) { | |
| 253 $seq2props{'seqname'} = $1; # redundant, since already set above | |
| 254 $seq2props{'description'} = $2 if $2; | |
| 255 } | |
| 256 next; | |
| 257 } | |
| 258 /^\(complement\)/ && do { | |
| 259 $hit_direction = -1; | |
| 260 next; | |
| 261 }; | |
| 262 # this matches | |
| 263 # start-end (start-end) pctid% | |
| 264 if(/(\d+)-(\d+)\s+\((\d+)-(\d+)\)\s+(\d+)%/) { | |
| 265 $seq1props{'start'} = $1; | |
| 266 $seq1props{'end'} = $2; | |
| 267 $seq2props{'start'} = $3; | |
| 268 $seq2props{'end'} = $4; | |
| 269 my $pctid = $5; | |
| 270 | |
| 271 if(! defined($estseq)) { | |
| 272 # for the first time here: need to set the references referring | |
| 273 # to seq1 and seq2 | |
| 274 if(! exists($self->{'_est_is_first'})) { | |
| 275 # detect which one is the EST by looking at the lengths, | |
| 276 # and assume that this holds throughout the entire result | |
| 277 # file (i.e., when this method is called for the next | |
| 278 # alignment, this will not be checked again) | |
| 279 if($seq1props{'length'} > $seq2props{'length'}) { | |
| 280 $self->{'_est_is_first'} = 0; | |
| 281 } else { | |
| 282 $self->{'_est_is_first'} = 1; | |
| 283 } | |
| 284 } | |
| 285 if($self->{'_est_is_first'}) { | |
| 286 $estseq = \%seq1props; | |
| 287 $genomseq = \%seq2props; | |
| 288 # if the EST is given first, A=4 selects the genomic | |
| 289 # seq for being reversed (reversing the EST is default) | |
| 290 $to_reverse = ($output_fmt == 4) ? $genomseq : $estseq; | |
| 291 } else { | |
| 292 $estseq = \%seq2props; | |
| 293 $genomseq = \%seq1props; | |
| 294 # if the EST is the second, A=4 does not change the | |
| 295 # seq being reversed (always the EST is reversed) | |
| 296 $to_reverse = $estseq; | |
| 297 } | |
| 298 } | |
| 299 if($hit_direction == -1) { | |
| 300 # we have to reverse the coordinates of one of both seqs | |
| 301 my $tmp = $to_reverse->{'start'}; | |
| 302 $to_reverse->{'start'} = | |
| 303 $to_reverse->{'length'} - $to_reverse->{'end'} + 1; | |
| 304 $to_reverse->{'end'} = $to_reverse->{'length'} - $tmp + 1; | |
| 305 } | |
| 306 # create and initialize the exon object | |
| 307 my $exon = Bio::Tools::Sim4::Exon->new( | |
| 308 '-start' => $genomseq->{'start'}, | |
| 309 '-end' => $genomseq->{'end'}, | |
| 310 '-strand' => $hit_direction); | |
| 311 if(exists($genomseq->{'seqname'})) { | |
| 312 $exon->seq_id($genomseq->{'seqname'}); | |
| 313 } else { | |
| 314 # take filename stripped of path as fall back | |
| 315 my ($basename) = &File::Basename::fileparse($genomseq->{'filename'}, '\..*'); | |
| 316 $exon->seq_id($basename); | |
| 317 } | |
| 318 $exon->feature1()->add_tag_value('filename', | |
| 319 $genomseq->{'filename'}); | |
| 320 # feature1 is supposed to be initialized to a Similarity object, | |
| 321 # but we provide a safety net | |
| 322 if($exon->feature1()->can('seqlength')) { | |
| 323 $exon->feature1()->seqlength($genomseq->{'length'}); | |
| 324 } else { | |
| 325 $exon->feature1()->add_tag_value('SeqLength', | |
| 326 $genomseq->{'length'}); | |
| 327 } | |
| 328 # create and initialize the feature wrapping the 'hit' (the EST) | |
| 329 my $fea2 = Bio::SeqFeature::Similarity->new( | |
| 330 '-start' => $estseq->{'start'}, | |
| 331 '-end' => $estseq->{'end'}, | |
| 332 '-strand' => 0, | |
| 333 '-primary' => "aligning_EST"); | |
| 334 if(exists($estseq->{'seqname'})) { | |
| 335 $fea2->seq_id($estseq->{'seqname'}); | |
| 336 } else { | |
| 337 # take filename stripped of path as fall back | |
| 338 my ($basename) = | |
| 339 &File::Basename::fileparse($estseq->{'filename'}, '\..*'); | |
| 340 $fea2->seq_id($basename); | |
| 341 } | |
| 342 $fea2->add_tag_value('filename', $estseq->{'filename'}); | |
| 343 $fea2->seqlength($estseq->{'length'}); | |
| 344 # store | |
| 345 $exon->est_hit($fea2); | |
| 346 # general properties | |
| 347 $exon->source_tag($self->analysis_method()); | |
| 348 $exon->percentage_id($pctid); | |
| 349 $exon->score($exon->percentage_id()); | |
| 350 # push onto array | |
| 351 push(@exons, $exon); | |
| 352 next; # back to while loop | |
| 353 } | |
| 354 } | |
| 355 return @exons; | |
| 356 } | |
| 357 | |
| 358 =head2 next_exonset | |
| 359 | |
| 360 Title : next_exonset | |
| 361 Usage : $exonset = $sim4_result->parse_next_exonset; | |
| 362 print "Exons start at ", $exonset->start(), | |
| 363 "and end at ", $exonset->end(), "\n"; | |
| 364 foreach $exon ($exonset->sub_SeqFeature()) { | |
| 365 # do something | |
| 366 } | |
| 367 Function: Parses the next alignment of the Sim4 result file and returns the | |
| 368 set of exons as a container of features. The container is itself | |
| 369 a Bio::SeqFeature::Generic object, with the Bio::Tools::Sim4::Exon | |
| 370 objects as sub features. Start, end, and strand of the container | |
| 371 will represent the total region covered by the exons of this set. | |
| 372 | |
| 373 See the documentation of parse_next_alignment() for further | |
| 374 reference about parsing and how the information is stored. | |
| 375 | |
| 376 Example : | |
| 377 Returns : An Bio::SeqFeature::Generic object holding Bio::Tools::Sim4::Exon | |
| 378 objects as sub features. | |
| 379 Args : | |
| 380 | |
| 381 =cut | |
| 382 | |
| 383 sub next_exonset { | |
| 384 my $self = shift; | |
| 385 my $exonset; | |
| 386 | |
| 387 # get the next array of exons | |
| 388 my @exons = $self->parse_next_alignment(); | |
| 389 return if($#exons < 0); | |
| 390 # create the container of exons as a feature object itself, with the | |
| 391 # data of the first exon for initialization | |
| 392 $exonset = Bio::SeqFeature::Generic->new('-start' => $exons[0]->start(), | |
| 393 '-end' => $exons[0]->end(), | |
| 394 '-strand' => $exons[0]->strand(), | |
| 395 '-primary' => "ExonSet"); | |
| 396 $exonset->source_tag($exons[0]->source_tag()); | |
| 397 $exonset->seq_id($exons[0]->seq_id()); | |
| 398 # now add all exons as sub features, with enabling EXPANsion of the region | |
| 399 # covered in total | |
| 400 foreach my $exon (@exons) { | |
| 401 $exonset->add_sub_SeqFeature($exon, 'EXPAND'); | |
| 402 } | |
| 403 return $exonset; | |
| 404 } | |
| 405 | |
| 406 =head2 next_feature | |
| 407 | |
| 408 Title : next_feature | |
| 409 Usage : while($exonset = $sim4->next_feature()) { | |
| 410 # do something | |
| 411 } | |
| 412 Function: Does the same as L<next_exonset()>. See there for documentation of | |
| 413 the functionality. Call this method repeatedly until FALSE is | |
| 414 returned. | |
| 415 | |
| 416 The returned object is actually a SeqFeatureI implementing object. | |
| 417 This method is required for classes implementing the | |
| 418 SeqAnalysisParserI interface, and is merely an alias for | |
| 419 next_exonset() at present. | |
| 420 | |
| 421 Example : | |
| 422 Returns : A Bio::SeqFeature::Generic object. | |
| 423 Args : | |
| 424 | |
| 425 =cut | |
| 426 | |
| 427 sub next_feature { | |
| 428 my ($self,@args) = @_; | |
| 429 # even though next_exonset doesn't expect any args (and this method | |
| 430 # does neither), we pass on args in order to be prepared if this changes | |
| 431 # ever | |
| 432 return $self->next_exonset(@args); | |
| 433 } | |
| 434 | |
| 435 1; |
